Convert to monorepo.

This commit is contained in:
Ethiraric 2024-10-02 17:53:26 +02:00
commit 57d2ff4b19
59 changed files with 7678 additions and 1 deletions

3
bench/.cargo/config.toml Normal file
View file

@ -0,0 +1,3 @@
[alias]
gen_large_yaml = "run --profile=release-lto --package gen_large_yaml --bin gen_large_yaml --manifest-path tools/gen_large_yaml/Cargo.toml --"
bench_compare = "run --package bench_compare --bin bench_compare --manifest-path tools/bench_compare/Cargo.toml --"

1
bench/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

136
bench/Cargo.lock generated Normal file
View file

@ -0,0 +1,136 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "ahash"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
dependencies = [
"cfg-if",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "allocator-api2"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "arraydeque"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d902e3d592a523def97af8f317b08ce16b7ab854c1985a0c671e6f15cebc236"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "hashbrown"
version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
dependencies = [
"ahash",
"allocator-api2",
]
[[package]]
name = "hashlink"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
dependencies = [
"hashbrown",
]
[[package]]
name = "once_cell"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "proc-macro2"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
dependencies = [
"proc-macro2",
]
[[package]]
name = "saphyr-bench"
version = "0.0.1"
dependencies = [
"saphyr-parser",
]
[[package]]
name = "saphyr-parser"
version = "0.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "886b4bb040ecd2944f54c3543e612c336396e3eba700c5063d8bad5f40bac3d7"
dependencies = [
"arraydeque",
"hashlink",
]
[[package]]
name = "syn"
version = "2.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "zerocopy"
version = "0.7.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

19
bench/Cargo.toml Normal file
View file

@ -0,0 +1,19 @@
[package]
name = "saphyr-bench"
version = "0.0.1"
authors = [ "Ethiraric <ethiraric@gmail.com>" ]
license = "MIT"
description = "Utilities to benchmark saphyr"
readme = "README.md"
edition = "2021"
[dependencies]
saphyr-parser = "0.0.1"
[[bin]]
name = "time_parse"
path = "tools/time_parse.rs"
[[bin]]
name = "run_bench"
path = "tools/run_bench.rs"

191
bench/LICENSE Normal file
View file

@ -0,0 +1,191 @@
Copyright (c) 2024 Ethiraric
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

3
bench/README.md Normal file
View file

@ -0,0 +1,3 @@
# `saphyr-bench`
Set of utilities to benchmark the `saphyr` library.

14
bench/justfile Normal file
View file

@ -0,0 +1,14 @@
before_commit:
cargo clippy --release --all-targets -- -D warnings
cargo clippy --all-targets -- -D warnings
cargo build --release --all-targets
cargo build --all-targets
cargo build --profile=release-lto --package gen_large_yaml --bin gen_large_yaml --manifest-path tools/gen_large_yaml/Cargo.toml
cargo build --profile=release-lto --package bench_compare --bin bench_compare --manifest-path tools/bench_compare/Cargo.toml
ethi_bench:
cargo build --release --all-targets
cd ../Yaml-rust && cargo build --release --all-targets
cd ../serde-yaml/ && cargo build --release --all-targets
cd ../libfyaml/build && ninja
cargo bench_compare run_bench

2
bench/src/lib.rs Normal file
View file

@ -0,0 +1,2 @@

229
bench/tools/README.md Normal file
View file

@ -0,0 +1,229 @@
# `yaml-rust2` tools
This directory contains tools that are used to develop the crate.
Due to dependency management, only some of them are available as binaries from the `yaml-rust2` crate.
| Tool | Invocation |
|------|------------|
| `bench_compare` | `cargo bench_compare` |
| `dump_events` | `cargo run --bin dump_events -- [...]` |
| `gen_large_yaml` | `cargo gen_large_yaml` |
| `run_bench` | `cargo run --bin run_bench -- [...]` |
| `time_parse` | `cargo run --bin time_parse -- [...]` |
## `bench_compare`
See the [dedicated README file](./bench_compare/README.md).
## `dump_events`
This is a debugging helper for the parser. It outputs events emitted by the parser for a given file. This can be paired with the `YAMLRUST2_DEBUG` environment variable to have an in-depth overview of which steps the scanner and the parser are taking.
### Example
Consider the following `input.yaml` YAML file:
```yaml
- foo: bar
- baz:
c: [3, 4, 5]
```
Running `cargo run --bin dump_events -- input.yaml` outputs:
```
↳ StreamStart
↳ DocumentStart
↳ SequenceStart(0, None)
↳ MappingStart(0, None)
↳ Scalar("foo", Plain, 0, None)
↳ Scalar("bar", Plain, 0, None)
↳ MappingEnd
↳ MappingStart(0, None)
↳ Scalar("baz", Plain, 0, None)
↳ Scalar("~", Plain, 0, None)
↳ Scalar("c", Plain, 0, None)
↳ SequenceStart(0, None)
↳ Scalar("3", Plain, 0, None)
↳ Scalar("4", Plain, 0, None)
↳ Scalar("5", Plain, 0, None)
↳ SequenceEnd
↳ MappingEnd
↳ SequenceEnd
↳ DocumentEnd
↳ StreamEnd
```
Running `YAMLRUST2_DEBUG=1 cargo run --bin dump_events -- input.yaml` outputs much more detail:
<details>
<summary> Full output </summary>
```
Parser state: StreamStart
↳ StreamStart(Utf8) Marker { index: 0, line: 1, col: 0 }
↳ StreamStart
Parser state: ImplicitDocumentStart
→ fetch_next_token after whitespace Marker { index: 0, line: 1, col: 0 } '-'
↳ BlockSequenceStart Marker { index: 0, line: 1, col: 0 }
↳ DocumentStart
Parser state: BlockNode
↳ SequenceStart(0, None)
Parser state: BlockSequenceFirstEntry
↳ BlockEntry Marker { index: 2, line: 1, col: 2 }
→ fetch_next_token after whitespace Marker { index: 2, line: 1, col: 2 } 'f'
→ fetch_next_token after whitespace Marker { index: 5, line: 1, col: 5 } ':'
↳ BlockMappingStart Marker { index: 5, line: 1, col: 5 }
↳ MappingStart(0, None)
Parser state: BlockMappingFirstKey
↳ Key Marker { index: 2, line: 1, col: 2 }
↳ Scalar(Plain, "foo") Marker { index: 2, line: 1, col: 2 }
↳ Scalar("foo", Plain, 0, None)
Parser state: BlockMappingValue
↳ Value Marker { index: 5, line: 1, col: 5 }
→ fetch_next_token after whitespace Marker { index: 7, line: 1, col: 7 } 'b'
↳ Scalar(Plain, "bar") Marker { index: 7, line: 1, col: 7 }
↳ Scalar("bar", Plain, 0, None)
Parser state: BlockMappingKey
→ fetch_next_token after whitespace Marker { index: 11, line: 2, col: 0 } '-'
↳ BlockEnd Marker { index: 11, line: 2, col: 0 }
↳ MappingEnd
Parser state: BlockSequenceEntry
↳ BlockEntry Marker { index: 13, line: 2, col: 2 }
→ fetch_next_token after whitespace Marker { index: 13, line: 2, col: 2 } 'b'
→ fetch_next_token after whitespace Marker { index: 16, line: 2, col: 5 } ':'
↳ BlockMappingStart Marker { index: 16, line: 2, col: 5 }
↳ MappingStart(0, None)
Parser state: BlockMappingFirstKey
↳ Key Marker { index: 13, line: 2, col: 2 }
↳ Scalar(Plain, "baz") Marker { index: 13, line: 2, col: 2 }
↳ Scalar("baz", Plain, 0, None)
Parser state: BlockMappingValue
↳ Value Marker { index: 16, line: 2, col: 5 }
→ fetch_next_token after whitespace Marker { index: 20, line: 3, col: 2 } 'c'
→ fetch_next_token after whitespace Marker { index: 21, line: 3, col: 3 } ':'
↳ Key Marker { index: 20, line: 3, col: 2 }
↳ Scalar("~", Plain, 0, None)
Parser state: BlockMappingKey
↳ Scalar(Plain, "c") Marker { index: 20, line: 3, col: 2 }
↳ Scalar("c", Plain, 0, None)
Parser state: BlockMappingValue
↳ Value Marker { index: 21, line: 3, col: 3 }
→ fetch_next_token after whitespace Marker { index: 23, line: 3, col: 5 } '['
↳ FlowSequenceStart Marker { index: 23, line: 3, col: 5 }
↳ SequenceStart(0, None)
Parser state: FlowSequenceFirstEntry
→ fetch_next_token after whitespace Marker { index: 24, line: 3, col: 6 } '3'
→ fetch_next_token after whitespace Marker { index: 25, line: 3, col: 7 } ','
↳ Scalar(Plain, "3") Marker { index: 24, line: 3, col: 6 }
↳ Scalar("3", Plain, 0, None)
Parser state: FlowSequenceEntry
↳ FlowEntry Marker { index: 25, line: 3, col: 7 }
→ fetch_next_token after whitespace Marker { index: 27, line: 3, col: 9 } '4'
→ fetch_next_token after whitespace Marker { index: 28, line: 3, col: 10 } ','
↳ Scalar(Plain, "4") Marker { index: 27, line: 3, col: 9 }
↳ Scalar("4", Plain, 0, None)
Parser state: FlowSequenceEntry
↳ FlowEntry Marker { index: 28, line: 3, col: 10 }
→ fetch_next_token after whitespace Marker { index: 30, line: 3, col: 12 } '5'
→ fetch_next_token after whitespace Marker { index: 31, line: 3, col: 13 } ']'
↳ Scalar(Plain, "5") Marker { index: 30, line: 3, col: 12 }
↳ Scalar("5", Plain, 0, None)
Parser state: FlowSequenceEntry
↳ FlowSequenceEnd Marker { index: 31, line: 3, col: 13 }
↳ SequenceEnd
Parser state: BlockMappingKey
→ fetch_next_token after whitespace Marker { index: 33, line: 4, col: 0 } '\0'
↳ BlockEnd Marker { index: 33, line: 4, col: 0 }
↳ MappingEnd
Parser state: BlockSequenceEntry
↳ BlockEnd Marker { index: 33, line: 4, col: 0 }
↳ SequenceEnd
Parser state: DocumentEnd
↳ StreamEnd Marker { index: 33, line: 4, col: 0 }
↳ DocumentEnd
Parser state: DocumentStart
↳ StreamEnd
```
</details>
While this cannot be shown in Markdown, the output is colored so that it is a bit easier to read.
## `gen_large_yaml`
It is hard to find large (100+MiB) real-world YAML files that could be used to benchmark a parser. This utility generates multiple large files that are meant to stress the parser with different layouts of YAML files. The resulting files do not look like anything that would be encountered in production, but can serve as a base to test several features of a YAML parser.
The generated files are the following:
- `big.yaml`: A large array of records with few fields. One of the fields is a description, a large text block scalar spanning multiple lines. Most of the scanning happens in block scalars.
- `nested.yaml`: Very short key-value pairs that nest deeply.
- `small_objects.yaml`: A large array of 2 key-value mappings.
- `strings_array.yaml`: A large array of lipsum one-liners (~150-175 characters in length).
All generated files are meant to be between 200 and 250 MiB in size.
This tool depends on external dependencies that are not part of `yaml-rust2`'s dependencies or `dev-dependencies` and as such can't be called through `cargo run` directly. A dedicated `cargo gen_large_yaml` alias can be used to generate the benchmark files.
## `run_bench`
This is a benchmarking helper that runs the parser on the given file a given number of times and is able to extract simple metrics out of the results. The `--output-yaml` flag can be specified to make the output a YAML file that can be fed into other tools.
This binary is made to be used by `bench_compare`.
Synopsis: `run_bench input.yaml <iterations> [--output-yaml]`
### Examples
```sh
$> cargo run --release --bin run_bench -- bench_yaml/big.yaml 10
Average: 1.631936191s
Min: 1.629654651s
Max: 1.633045284s
95%: 1.633045284s
$> cargo run --release --bin run_bench -- bench_yaml/big.yaml 10 --output-yaml
parser: yaml-rust2
input: bench_yaml/big.yaml
average: 1649847674
min: 1648277149
max: 1651936305
percentile95: 1651936305
iterations: 10
times:
- 1650216129
- 1649349978
- 1649507018
- 1648277149
- 1649036548
- 1650323982
- 1650917692
- 1648702081
- 1650209860
- 1651936305
```
## `time_parse`
This is a benchmarking helper that times how long it takes for the parser to emit all events. It calls the parser on the given input file, receives parsing events and then immediately discards them. It is advised to run this tool with `--release`.
### Examples
Loading a small file could output the following:
```sh
$> cargo run --release --bin time_parse -- input.yaml
Loaded 0MiB in 14.189µs
```
While loading a larger file could output the following:
```sh
$> cargo run --release --bin time_parse -- bench_yaml/big.yaml
Loaded 220MiB in 1.612677853s
```

1
bench/tools/bench_compare/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

266
bench/tools/bench_compare/Cargo.lock generated Normal file
View file

@ -0,0 +1,266 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "addr2line"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
dependencies = [
"gimli",
]
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "anyhow"
version = "1.0.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247"
dependencies = [
"backtrace",
]
[[package]]
name = "backtrace"
version = "0.3.71"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d"
dependencies = [
"addr2line",
"cc",
"cfg-if",
"libc",
"miniz_oxide",
"object",
"rustc-demangle",
]
[[package]]
name = "bench_compare"
version = "0.6.0"
dependencies = [
"anyhow",
"serde",
"serde_yaml",
"toml",
]
[[package]]
name = "cc"
version = "1.0.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "equivalent"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "gimli"
version = "0.28.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
[[package]]
name = "hashbrown"
version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
[[package]]
name = "indexmap"
version = "2.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
dependencies = [
"equivalent",
"hashbrown",
]
[[package]]
name = "itoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "libc"
version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "memchr"
version = "2.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
[[package]]
name = "miniz_oxide"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7"
dependencies = [
"adler",
]
[[package]]
name = "object"
version = "0.32.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
dependencies = [
"memchr",
]
[[package]]
name = "proc-macro2"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rustc-demangle"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
[[package]]
name = "ryu"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1"
[[package]]
name = "serde"
version = "1.0.197"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.197"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_spanned"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb3622f419d1296904700073ea6cc23ad690adbd66f13ea683df73298736f0c1"
dependencies = [
"serde",
]
[[package]]
name = "serde_yaml"
version = "0.9.34+deprecated"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
dependencies = [
"indexmap",
"itoa",
"ryu",
"serde",
"unsafe-libyaml",
]
[[package]]
name = "syn"
version = "2.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "toml"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9dd1545e8208b4a5af1aa9bbd0b4cf7e9ea08fabc5d0a5c67fcaafa17433aa3"
dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit",
]
[[package]]
name = "toml_datetime"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1"
dependencies = [
"serde",
]
[[package]]
name = "toml_edit"
version = "0.22.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e40bb779c5187258fd7aad0eb68cb8706a0a81fa712fbea808ab43c4b8374c4"
dependencies = [
"indexmap",
"serde",
"serde_spanned",
"toml_datetime",
"winnow",
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "unsafe-libyaml"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
[[package]]
name = "winnow"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dffa400e67ed5a4dd237983829e66475f0a4a26938c4b04c21baede6262215b8"
dependencies = [
"memchr",
]

View file

@ -0,0 +1,21 @@
[package]
name = "bench_compare"
version = "0.6.0"
authors = [
"Ethiraric <ethiraric@gmail.com>"
]
license = "MIT OR Apache-2.0"
description = "Run multiple YAML parsers and compare their times"
repository = "https://github.com/Ethiraric/yaml-rust2"
readme = "README.md"
edition = "2018"
[dependencies]
anyhow = { version = "1.0.81", features = ["backtrace"] }
serde = { version = "1.0.197", features = ["derive"] }
serde_yaml = "0.9.32"
toml = "0.8.11"
[profile.release-lto]
inherits = "release"
lto = true

View file

@ -0,0 +1,120 @@
# `bench_compare`
This tool helps compare the time different YAML parsers take to parse the same input.
## Synopsis
```
bench_compare time_parse
bench_compare run_bench
```
This will run either `time_parse` or `run_bench` (described below) with the given set of parsers from the configuration file.
## Parsers requirements
Parsers are expected to be event-based. In order to be fair to this crate's benchmark implementation, parsers should:
* Load the file into memory (a string, `mmap`, ...) **prior** to starting the clock
* Initialize the parser, if needed
* **Start the clock**
* Read events from the parser while the parser has not finished parsing
* Discard events as they are received (dropping them, `free`ing them or anything similar) so that memory consumption does not grow too high and the parser can reuse event structures
* **Stop the clock**
* Destroy the resources, if needed/wanted (parser, file buffer, ...). The kernel will reclaim them anyway once the process exits.
## Parsers required binaries
This tool recognizes 2 binaries: `time_parse` and `run_bench`.
### `time_parse`
Synopsis:
```
time_parse file.yaml [--short]
```
The binary must run the aforementioned steps and display on its output the time the parser took to parse the given file.
With the `--short` option, the binary must only output the benchmark time in nanoseconds.
```sh
# This is meant to be human-readable.
# The example below is what this crate implements.
$> time_parse file.yaml
Loaded 200MiB in 1.74389s.
# This will be read by this tool.
# This must output ONLY the time, in nanoseconds.
$> time_parse file.yaml --short
1743892394
```
This tool will always provide the `--short` option.
### `run_bench`
Synopsis:
```
run_bench file.yaml <iterations> [--output-yaml]
```
The binary is expected to run `<iterations>` runs of the aforementioned steps and display on its output relevant information.
The `--output-yaml` instructs the binary to output details about its runs in YAML on its standard output.
The binary may optionally perform some warmup runs prior to running the benchmark. The time it took the binary to run will not be evaluated.
```sh
# This is meant to be human-readable.
# The example below is what this crate implements.
$> run_bench file.yaml 100
Average: 1.589485s
Min : 1.583078s
Max : 1.597028s
95% : 1.593219s
# This will be read by this tool.
# This must output a YAML as described below.
$> run_bench ../file.yaml 10 --output-yaml
parser: yaml-rust2
input: ../file.yaml
average: 1620303590
min: 1611632108
max: 1636401896
percentile95: 1636401896
iterations: 10
times:
- 1636401896
- 1623914538
- 1611632108
- 1612973608
- 1617748930
- 1615419514
- 1612172250
- 1620791346
- 1629339306
- 1622642412
```
The expected fields are (all times in nanoseconds):
* `parser`: The name of the parser (in case of a mistake renaming files)
* `input`: The path to the input file as given to the binary arguments
* `average`: The average time it took to run the parser
* `min`: The shortest time it took to run the parser
* `max`: The longest time it took to run the parser
* `percentile95`: The 95th percentile time of the runs
* `iterations`: The number of times the parser was run (`<iterations>`)
* `times`: An array of `iterations` times, one for each run, in the order they were run (first run first)
## Configuration
`bench_compare` is configured through a `bench_compare.toml` file. This file must be located in the current directory.
As of now, default values are unsupported and all fields must be set. The following fields are required:
```toml
yaml_input_dir = "bench_yaml" # The path to the directory containing the input yaml files
iterations = 10 # The number of iterations, if using `run_bench`
yaml_output_dir = "yaml_output" # The directory in which `run_bench`'s yamls are saved
csv_output = "benchmark.csv" # The CSV output aggregating times for each parser and file
[[parsers]] # A parser, can be repeated as many times as there are parsers
name = "yaml-rust2" # The name of the parser (used for logging)
path = "target/release/" # The path in which the parsers' `run_bench` and `time_parse` are
# If there is another parser, another block can be added
# [[parsers]]
# name = "libfyaml"
# path = "../libfyaml/build"
```

View file

@ -0,0 +1,174 @@
use std::{fs::File, io::BufWriter, io::Write, path::Path};
use anyhow::Error;
use serde::{Deserialize, Serialize};
/// Program entry point.
///
/// Delegates to [`entrypoint`]. On failure, the error is printed to standard error using its
/// debug representation and the process exits with status 1.
fn main() {
    match entrypoint() {
        Ok(()) => {}
        Err(e) => {
            eprintln!("{e:?}");
            std::process::exit(1);
        }
    }
}
fn entrypoint() -> Result<(), Error> {
let config: Config =
toml::from_str(&std::fs::read_to_string("bench_compare.toml").unwrap()).unwrap();
if config.parsers.is_empty() {
println!("Please add at least one parser. Refer to the README for instructions.");
return Ok(());
}
let args: Vec<_> = std::env::args().collect();
if args.len() != 2
|| (args.len() == 2 && !["time_parse", "run_bench"].contains(&args[1].as_str()))
{
println!("Usage: bench_compare <time_parse|run_bench>");
return Ok(());
}
match args[1].as_str() {
"run_bench" => run_bench(&config)?,
"time_parse" => unimplemented!(),
_ => unreachable!(),
}
Ok(())
}
/// Run the `run_bench` binary on the given parsers.
fn run_bench(config: &Config) -> Result<(), Error> {
// Create output directory
std::fs::create_dir_all(&config.yaml_output_dir)?;
let inputs = list_input_files(config)?;
let iterations = format!("{}", config.iterations);
let mut averages = vec![];
// Inputs are ordered, so are parsers.
for input in &inputs {
let input_basename = Path::new(&input).file_name().unwrap().to_string_lossy();
let mut input_times = vec![];
// Run each input for each parser.
for parser in &config.parsers {
println!("Running {input_basename} against {}", parser.name);
// Run benchmark
let path = Path::new(&parser.path).join("run_bench");
let output = std::process::Command::new(path)
.arg(input)
.arg(&iterations)
.arg("--output-yaml")
.output()?;
// Check exit status.
if output.status.code().unwrap_or(1) == 0 {
let s = String::from_utf8_lossy(&output.stdout);
// Get output as yaml.
match serde_yaml::from_str::<BenchYamlOutput>(&s) {
Ok(output) => {
// Push average into our CSV-to-be.
input_times.push(output.average);
// Save the YAML for later.
serde_yaml::to_writer(
BufWriter::new(File::create(format!(
"{}/{}-{}",
config.yaml_output_dir, parser.name, input_basename
))?),
&output,
)?;
}
Err(e) => {
// Yaml is invalid, use 0 as "didn't run properly".
println!("Errored: Invalid YAML output: {e}");
input_times.push(0);
}
}
} else {
// An error happened, use 0 as "didn't run properly".
println!("Errored: process did exit non-zero");
input_times.push(0);
}
}
averages.push(input_times);
}
// Finally, save a CSV.
save_run_bench_csv(config, &inputs, &averages)
}
/// General configuration structure.
///
/// Deserialized from the `bench_compare.toml` file read at startup.
#[derive(Serialize, Deserialize)]
struct Config {
/// The path to the directory containing the input yaml files.
yaml_input_dir: String,
/// Number of iterations to run, if using `run_bench`.
iterations: u32,
/// The parsers to run.
parsers: Vec<Parser>,
/// The path to the directory in which `run_bench`'s yamls are saved.
yaml_output_dir: String,
/// The path to the CSV output aggregating times for each parser and file.
csv_output: String,
}
/// A parser configuration.
#[derive(Serialize, Deserialize)]
struct Parser {
/// The name of the parser.
///
/// Used in log messages, in saved YAML file names and as a CSV column header.
name: String,
/// The path in which the parser's `run_bench` and `time_parse` are located.
///
/// The binary name is joined to this directory to build the command to run.
path: String,
}
/// Output of running `run_bench` on a given parser.
///
/// Parsed from the YAML the binary prints on its standard output.
#[derive(Serialize, Deserialize)]
struct BenchYamlOutput {
/// The name of the parser.
parser: String,
/// The file taken as input.
input: String,
/// Average parsing time (ns).
average: u64,
/// Shortest parsing time (ns).
min: u64,
/// Longest parsing time (ns).
max: u64,
/// 95th percentile of parsing times (ns).
percentile95: u64,
/// Number of iterations.
iterations: u64,
/// Parsing times for each run.
times: Vec<u64>,
}
/// Save a CSV file with all averages from `run_bench`.
///
/// The first row is a header: an empty cell followed by one cell per configured parser name.
/// Each following row holds the file name of an input followed by its averages. `inputs` and
/// `averages` are paired element by element, so `averages[i]` must hold the times for
/// `inputs[i]`.
///
/// # Errors
/// Returns an error if the CSV file cannot be created or written.
///
/// # Panics
/// Panics if an input path has no file name component.
fn save_run_bench_csv(
    config: &Config,
    inputs: &[String],
    averages: &[Vec<u64>],
) -> Result<(), Error> {
    let mut csv = BufWriter::new(File::create(&config.csv_output)?);

    // Header row; the leading comma leaves the first column (file names) untitled.
    for parser in &config.parsers {
        write!(csv, ",{}", parser.name)?;
    }
    writeln!(csv)?;

    for (path, averages) in inputs.iter().zip(averages) {
        let filename = Path::new(path).file_name().unwrap().to_string_lossy();
        write!(csv, "{filename}")?;
        for avg in averages {
            write!(csv, ",{avg}")?;
        }
        writeln!(csv)?;
    }

    // Flush explicitly: errors raised while flushing on drop are silently ignored.
    csv.flush()?;
    Ok(())
}
/// Returns the paths to the input yaml files.
fn list_input_files(config: &Config) -> Result<Vec<String>, Error> {
Ok(std::fs::read_dir(&config.yaml_input_dir)?
.filter_map(Result::ok)
.map(|entry| entry.path().to_string_lossy().to_string())
.filter(|path| {
Path::new(path)
.extension()
.map_or(false, |ext| ext.eq_ignore_ascii_case("yaml"))
})
.collect())
}

View file

@ -0,0 +1,38 @@
use std::env;
use std::fs::File;
use std::io::prelude::*;
use yaml_rust2::{
parser::{MarkedEventReceiver, Parser},
scanner::Marker,
Event,
};
/// An event receiver that stores every event it is given, along with its marker.
#[derive(Debug)]
struct EventSink {
/// The events received so far, in the order they were received.
events: Vec<(Event, Marker)>,
}
impl MarkedEventReceiver for EventSink {
/// Print the event's debug representation to standard error, then store the event and its
/// marker.
fn on_event(&mut self, ev: Event, mark: Marker) {
// The line is wrapped in ANSI escape sequences (`\x1B[;34m` ... `\x1B[;m`) and prefixed with
// the U+21B3 arrow.
eprintln!(" \x1B[;34m\u{21B3} {:?}\x1B[;m", &ev);
self.events.push((ev, mark));
}
}
/// Parse `yaml` and return every event the parser emitted, paired with its marker.
///
/// # Panics
/// Panics if the parser reports an error.
fn str_to_events(yaml: &str) -> Vec<(Event, Marker)> {
    let mut receiver = EventSink { events: Vec::new() };
    // The receiver both logs and records each event as the parser emits it.
    Parser::new_from_str(yaml)
        .load(&mut receiver, true)
        .unwrap();
    receiver.events
}
/// Read the YAML file given as first argument and dump its parsing events.
///
/// Events are printed to standard error by the sink as they are received. A missing argument
/// or an unreadable file is reported on standard error and the process exits with status 1.
fn main() {
    let args: Vec<_> = env::args().collect();
    let path = match args.get(1) {
        Some(path) => path,
        None => {
            let program = args.first().map_or("<program>", String::as_str);
            eprintln!("Usage: {program} <file.yaml>");
            std::process::exit(1);
        }
    };

    let mut s = String::new();
    if let Err(e) = File::open(path).and_then(|mut f| f.read_to_string(&mut s)) {
        eprintln!("Cannot read {path}: {e}");
        std::process::exit(1);
    }

    str_to_events(&s);
}

1
bench/tools/gen_large_yaml/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

86
bench/tools/gen_large_yaml/Cargo.lock generated Normal file
View file

@ -0,0 +1,86 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "gen_large_yaml"
version = "0.6.0"
dependencies = [
"lipsum",
"rand",
]
[[package]]
name = "getrandom"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "libc"
version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "lipsum"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "636860251af8963cc40f6b4baadee105f02e21b28131d76eba8e40ce84ab8064"
dependencies = [
"rand",
"rand_chacha",
]
[[package]]
name = "ppv-lite86"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

View file

@ -0,0 +1,19 @@
[package]
name = "gen_large_yaml"
version = "0.6.0"
authors = [
"Ethiraric <ethiraric@gmail.com>"
]
license = "MIT OR Apache-2.0"
description = "A helper to generate large YAML files"
repository = "https://github.com/Ethiraric/yaml-rust2"
readme = "README.md"
edition = "2018"
[dependencies]
rand = { version = "0.8.5", features = [ "small_rng" ] }
lipsum = "0.9.0"
[profile.release-lto]
inherits = "release"
lto = true

View file

@ -0,0 +1,156 @@
#![allow(clippy::too_many_arguments)]
use rand::{distributions::Alphanumeric, rngs::SmallRng, Rng};
/// Generate a string of exactly `len` lowercase hexadecimal digits.
pub fn hex_string(rng: &mut SmallRng, len: usize) -> String {
    let hex_digits: &[u8] = b"0123456789abcdef";
    // The length range `len..len + 1` contains the single value `len`.
    let len_hi = len + 1;
    string_from_set(rng, len, len_hi, hex_digits)
}
/// Generate an e-mail address at `example.com`.
///
/// The local part is a random string of letters, digits, `-`, `_` and `.` whose length is
/// drawn from `len_lo..len_hi`.
pub fn email(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String {
    let allowed: &[u8] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_.0123456789";
    let local_part = string_from_set(rng, len_lo, len_hi, allowed);
    format!("{}@example.com", local_part)
}
/// Generate a random URL on `example.com`.
///
/// The URL starts with `{scheme}://example.com`, followed by a number of `/`-prefixed
/// alphanumeric path segments drawn from `n_paths_lo..n_paths_hi`, each with a length drawn
/// from `path_len_lo..path_len_hi`. If `extension` is given, `.` and the extension are appended.
pub fn url(
    rng: &mut SmallRng,
    scheme: &str,
    n_paths_lo: usize,
    n_paths_hi: usize,
    path_len_lo: usize,
    path_len_hi: usize,
    extension: Option<&str>,
) -> String {
    let mut result = format!("{scheme}://example.com");

    let n_segments = rng.gen_range(n_paths_lo..n_paths_hi);
    for _ in 0..n_segments {
        let segment = alnum_string(rng, path_len_lo, path_len_hi);
        result.push('/');
        result.push_str(&segment);
    }

    match extension {
        Some(ext) => {
            result.push('.');
            result.push_str(ext);
        }
        None => {}
    }
    result
}
/// Generate a random integer drawn from the range `lo..hi`.
pub fn integer(rng: &mut SmallRng, lo: i64, hi: i64) -> i64 {
    let bounds = lo..hi;
    rng.gen_range(bounds)
}
/// Generate an alphanumeric string whose length is drawn from `lo_len..hi_len`.
pub fn alnum_string(rng: &mut SmallRng, lo_len: usize, hi_len: usize) -> String {
    // The length is drawn first, then one sample per character.
    let n_chars = rng.gen_range(lo_len..hi_len);
    let samples = rng.sample_iter(&Alphanumeric).take(n_chars);
    samples.map(char::from).collect()
}
/// Generate a random string whose characters are picked from `set`.
///
/// The length is drawn from `len_lo..len_hi`, then each character is picked independently from
/// `set`. Every byte of `set` is converted to a `char` as-is.
pub fn string_from_set(rng: &mut SmallRng, len_lo: usize, len_hi: usize, set: &[u8]) -> String {
    let len = rng.gen_range(len_lo..len_hi);
    let mut out = String::new();
    for _ in 0..len {
        let idx = rng.gen_range(0..set.len());
        out.push(set[idx] as char);
    }
    out
}
/// Generate a lipsum paragraph.
///
/// Sentences of `wps_lo..wps_hi` words are generated and appended to the paragraph until it
/// holds at least the number of lines drawn from `lines_lo..lines_hi`. Lines are wrapped at
/// whitespace so that they are at most `line_maxcol` bytes long; a new sentence continues the
/// last line of the previous one.
///
/// # Panics
/// Panics if a line has to be wrapped but contains no whitespace in its first `line_maxcol`
/// bytes.
pub fn paragraph(
    rng: &mut SmallRng,
    lines_lo: usize,
    lines_hi: usize,
    wps_lo: usize,
    wps_hi: usize,
    line_maxcol: usize,
) -> Vec<String> {
    let mut ret = Vec::new();
    let nlines = rng.gen_range(lines_lo..lines_hi);

    while ret.len() < nlines {
        let words_in_sentence = rng.gen_range(wps_lo..wps_hi);
        let mut sentence = lipsum::lipsum_words_with_rng(rng.clone(), words_in_sentence);

        // Continue on the last line of the previous sentence rather than starting a new one.
        if let Some(last_line) = ret.pop() {
            sentence = format!("{last_line} {sentence}");
        }

        while sentence.len() > line_maxcol {
            // Byte index of the last whitespace that fits on the line. The line is everything
            // before it and the remainder starts right after it, so no trailing space is kept
            // and no character of the next word is lost.
            // NOTE(review): `+ 1` assumes the separator is a one-byte character (lipsum words
            // are presumably separated by ASCII spaces) - confirm.
            let break_idx = sentence[..line_maxcol]
                .rfind(char::is_whitespace)
                .expect("no whitespace to wrap the line at");
            let rest = sentence[break_idx + 1..].to_string();
            sentence.truncate(break_idx);
            ret.push(sentence);
            sentence = rest;
        }
        if !sentence.is_empty() {
            ret.push(sentence);
        }
    }

    ret
}
/// Generate a full name: two names (see [`name`]) separated by a space.
pub fn full_name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String {
    // The first name is generated before the last name.
    let first = name(rng, len_lo, len_hi);
    let last = name(rng, len_lo, len_hi);
    format!("{} {}", first, last)
}
/// Generate a name: one uppercase letter followed by lowercase letters.
///
/// The number of lowercase letters is drawn from `len_lo..len_hi`.
pub fn name(rng: &mut SmallRng, len_lo: usize, len_hi: usize) -> String {
    const UPPER: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const LOWER: &[u8] = b"abcdefghijklmnopqrstuvwxyz";

    // Draw order: tail length, then the initial, then the tail itself.
    let tail_len = rng.gen_range(len_lo..len_hi);
    let initial = UPPER[rng.gen_range(0..UPPER.len())] as char;
    let tail = string_from_set(rng, tail_len, tail_len + 1, LOWER);

    let mut out = String::new();
    out.push(initial);
    out.push_str(&tail);
    out
}
/// Generate a set of lipsum words on a single line.
///
/// The number of words is drawn from `words_lo..words_hi`. The characters `-`, `'`, `"`, `,`,
/// `*` and `:` are removed from the generated text.
pub fn words(rng: &mut SmallRng, words_lo: usize, words_hi: usize) -> String {
    let nwords = rng.gen_range(words_lo..words_hi);
    let raw = lipsum::lipsum_words_with_rng(rng.clone(), nwords);
    raw.chars()
        .filter(|c| !"-\'\",*:".contains(*c))
        .collect()
}
/// Generate a lipsum text.
///
/// A text is a number of paragraphs drawn from `paragraphs_lo..paragraphs_hi`, separated by one
/// empty line. The remaining arguments are forwarded to [`paragraph`] for each paragraph.
pub fn text(
    rng: &mut SmallRng,
    paragraphs_lo: usize,
    paragraphs_hi: usize,
    lines_lo: usize,
    lines_hi: usize,
    wps_lo: usize,
    wps_hi: usize,
    line_maxcol: usize,
) -> Vec<String> {
    let n_paragraphs = rng.gen_range(paragraphs_lo..paragraphs_hi);
    let mut lines = Vec::new();
    for idx in 0..n_paragraphs {
        // An empty line goes between paragraphs, not before the first one.
        if idx != 0 {
            lines.push(String::new());
        }
        let block = paragraph(rng, lines_lo, lines_hi, wps_lo, wps_hi, line_maxcol);
        lines.extend(block);
    }
    lines
}

View file

@ -0,0 +1,261 @@
#![allow(dead_code)]
mod gen;
mod nested;
use std::fs::File;
use std::io::BufWriter;
use std::path::Path;
use rand::{rngs::SmallRng, Rng, SeedableRng};
/// The path into which the generated YAML files will be written.
const OUTPUT_DIR: &str = "bench_yaml";
/// Generate the benchmark YAML files into [`OUTPUT_DIR`].
///
/// Four files are written: `big.yaml`, `nested.yaml`, `small_objects.yaml` and
/// `strings_array.yaml`. The output directory is created if it does not exist.
///
/// # Errors
/// Returns any I/O error raised while creating the directory or writing a file.
fn main() -> std::io::Result<()> {
    let mut generator = Generator::new();
    let output_path = Path::new(OUTPUT_DIR);
    std::fs::create_dir_all(output_path)?;

    // Every writer is flushed explicitly: errors raised while flushing on drop are ignored.
    println!("Generating big.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("big.yaml"))?);
    generator.gen_record_array(&mut out, 100_000, 100_001)?;
    std::io::Write::flush(&mut out)?;

    println!("Generating nested.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("nested.yaml"))?);
    nested::create_deep_object(&mut out, 1_100_000)?;
    std::io::Write::flush(&mut out)?;

    println!("Generating small_objects.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("small_objects.yaml"))?);
    generator.gen_authors_array(&mut out, 4_000_000, 4_000_001)?;
    std::io::Write::flush(&mut out)?;

    println!("Generating strings_array.yaml");
    let mut out = BufWriter::new(File::create(output_path.join("strings_array.yaml"))?);
    generator.gen_strings_array(&mut out, 1_300_000, 1_300_001, 10, 40)?;
    std::io::Write::flush(&mut out)?;

    Ok(())
}
/// YAML Generator.
///
/// Holds the random state and the indentation stack used by the generation methods.
struct Generator {
/// The RNG state.
///
/// We don't need to be cryptographically secure. [`SmallRng`] also implements the
/// [`SeedableRng`] trait, allowing runs to be predictable.
rng: SmallRng,
/// The stack of indentations.
///
/// The last element is the indentation, in columns, currently in effect.
indents: Vec<usize>,
}
/// Signature of a function writing the value of an object field to a writer of type `W`.
///
/// It is called at most once, with the generator and the writer.
type GenFn<W> = dyn FnOnce(&mut Generator, &mut W) -> std::io::Result<()>;
impl Generator {
    /// Build a generator with a fixed RNG seed and a base indentation of zero columns.
    fn new() -> Self {
        let rng = SmallRng::seed_from_u64(42);
        let indents = vec![0];
        Generator { rng, indents }
    }

    /// Write a YAML sequence whose items are records (see [`Self::gen_record_object`]).
    ///
    /// The number of items is drawn from `items_lo..items_hi`.
    fn gen_record_array<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        items_lo: usize,
        items_hi: usize,
    ) -> std::io::Result<()> {
        self.gen_array(writer, items_lo, items_hi, Self::gen_record_object)
    }

    /// Write a YAML sequence of lipsum one-liners.
    ///
    /// The number of items is drawn from `items_lo..items_hi`; each item holds a number of words
    /// drawn from `words_lo..words_hi`.
    fn gen_strings_array<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        items_lo: usize,
        items_hi: usize,
        words_lo: usize,
        words_hi: usize,
    ) -> std::io::Result<()> {
        self.gen_array(writer, items_lo, items_hi, |g, out| {
            write!(out, "{}", gen::words(&mut g.rng, words_lo, words_hi))
        })
    }

    /// Write a YAML mapping containing a record.
    ///
    /// Fields are, in order: description, authors, hash, version, home, repository and pdf.
    /// The `description` field is a multi-line block scalar and `authors` is a nested sequence
    /// (see [`Self::gen_authors_array`]).
    fn gen_record_object<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
        let fields: Vec<(String, Box<GenFn<W>>)> = vec![
            (
                String::from("description"),
                Box::new(|g, out| {
                    // Literal block scalar: the text goes on the following lines, indented.
                    write!(out, "|")?;
                    g.push_indent(2);
                    g.nl(out)?;
                    let indent = g.indent();
                    let text = gen::text(&mut g.rng, 1, 9, 3, 8, 10, 20, 80 - indent);
                    g.write_lines(out, &text)?;
                    g.pop_indent();
                    Ok(())
                }),
            ),
            (
                String::from("authors"),
                Box::new(|g, out| {
                    // The sequence starts on its own line, indented below the key.
                    g.push_indent(2);
                    g.nl(out)?;
                    g.gen_authors_array(out, 1, 10)?;
                    g.pop_indent();
                    Ok(())
                }),
            ),
            (
                String::from("hash"),
                Box::new(|g, out| write!(out, "{}", gen::hex_string(&mut g.rng, 64))),
            ),
            (
                String::from("version"),
                Box::new(|g, out| write!(out, "{}", gen::integer(&mut g.rng, 1, 9))),
            ),
            (
                String::from("home"),
                Box::new(|g, out| {
                    write!(out, "{}", gen::url(&mut g.rng, "https", 0, 1, 0, 0, None))
                }),
            ),
            (
                String::from("repository"),
                Box::new(|g, out| {
                    write!(out, "{}", gen::url(&mut g.rng, "git", 1, 4, 10, 20, None))
                }),
            ),
            (
                String::from("pdf"),
                Box::new(|g, out| {
                    write!(
                        out,
                        "{}",
                        gen::url(&mut g.rng, "https", 1, 4, 10, 30, Some("pdf"))
                    )
                }),
            ),
        ];
        self.gen_object(writer, fields)
    }

    /// Write a YAML sequence whose items are authors (see [`Self::gen_author_object`]).
    ///
    /// The number of items is drawn from `items_lo..items_hi`.
    fn gen_authors_array<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        items_lo: usize,
        items_hi: usize,
    ) -> std::io::Result<()> {
        self.gen_array(writer, items_lo, items_hi, Self::gen_author_object)
    }

    /// Write a small YAML mapping with 2 string fields: `name` and `email`.
    fn gen_author_object<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
        let fields: Vec<(String, Box<GenFn<W>>)> = vec![
            (
                String::from("name"),
                Box::new(|g, out| write!(out, "{}", gen::full_name(&mut g.rng, 10, 15))),
            ),
            (
                String::from("email"),
                Box::new(|g, out| write!(out, "{}", gen::email(&mut g.rng, 1, 9))),
            ),
        ];
        self.gen_object(writer, fields)
    }

    /// Write a YAML sequence whose nodes are written by `obj_creator`.
    ///
    /// The number of items is drawn from `len_lo..len_hi`. Each item starts with `- ` and is
    /// written with the indentation increased by 2 columns. Items are separated by a new line.
    fn gen_array<W: std::io::Write, F: FnMut(&mut Generator, &mut W) -> std::io::Result<()>>(
        &mut self,
        writer: &mut W,
        len_lo: usize,
        len_hi: usize,
        mut obj_creator: F,
    ) -> std::io::Result<()> {
        let count = self.rng.gen_range(len_lo..len_hi);
        for idx in 0..count {
            if idx != 0 {
                self.nl(writer)?;
            }
            write!(writer, "- ")?;
            self.push_indent(2);
            obj_creator(self, writer)?;
            self.pop_indent();
        }
        Ok(())
    }

    /// Write a YAML mapping with the given fields.
    ///
    /// Each field is written as `key: ` followed by whatever its function writes. Fields are
    /// separated by a new line.
    fn gen_object<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        fields: Vec<(String, Box<GenFn<W>>)>,
    ) -> std::io::Result<()> {
        for (idx, (key, write_value)) in fields.into_iter().enumerate() {
            if idx != 0 {
                self.nl(writer)?;
            }
            write!(writer, "{key}: ")?;
            write_value(self, writer)?;
        }
        Ok(())
    }

    /// Write the given lines, separated by a new line at the current indentation.
    ///
    /// Nothing is written before the first line nor after the last one.
    fn write_lines<W: std::io::Write>(
        &mut self,
        writer: &mut W,
        lines: &[String],
    ) -> std::io::Result<()> {
        for (idx, line) in lines.iter().enumerate() {
            if idx != 0 {
                self.nl(writer)?;
            }
            write!(writer, "{line}")?;
        }
        Ok(())
    }

    /// Write a line break to the writer, followed by the current indentation in spaces.
    fn nl<W: std::io::Write>(&mut self, writer: &mut W) -> std::io::Result<()> {
        writeln!(writer)?;
        (0..self.indent()).try_for_each(|_| write!(writer, " "))
    }

    /// Return the current indentation, in columns.
    fn indent(&self) -> usize {
        self.indents.last().copied().unwrap()
    }

    /// Push a new indentation, `offset` columns deeper than the current one.
    fn push_indent(&mut self, offset: usize) {
        let deeper = self.indent() + offset;
        self.indents.push(deeper);
    }

    /// Pop the last indentation.
    ///
    /// # Panics
    /// Panics if this leaves the indentation stack empty.
    fn pop_indent(&mut self) {
        let _ = self.indents.pop();
        assert!(!self.indents.is_empty());
    }
}

View file

@ -0,0 +1,115 @@
use std::{cell::RefCell, rc::Rc};
use rand::{rngs::SmallRng, Rng, SeedableRng};
/// Create a deep object with the given amount of nodes and write it as YAML to `writer`.
///
/// `n_nodes` nodes are added to a tree that initially holds only its root.
pub fn create_deep_object<W: std::io::Write>(
    writer: &mut W,
    n_nodes: usize,
) -> std::io::Result<()> {
    let mut tree = Tree::new();
    (0..n_nodes).for_each(|_| tree.push_node());
    tree.write_to(writer)
}
/// An n-tree.
///
/// The algorithm used to generate a potentially deep object is to create a tree, one node at a
/// time, where each node is put as a child of a random existing node in the tree.
struct Tree {
/// The tree-view of the tree.
root: Rc<RefCell<Node>>,
/// Array of all the nodes in the tree, including the root node.
///
/// Nodes are stored in the order they were created; the root comes first.
nodes: Vec<Rc<RefCell<Node>>>,
/// The RNG state.
///
/// We don't need to be cryptographically secure. [`SmallRng`] also implements the
/// [`SeedableRng`] trait, allowing runs to be predictable.
rng: SmallRng,
}
/// A node in a tree.
///
/// A node carries no data of its own; a node without children is written as a leaf entry.
struct Node {
/// All the children of the node.
children: Vec<Rc<RefCell<Node>>>,
}
impl Tree {
/// Create a new tree.
fn new() -> Self {
let root = Node::new_rc_refcell();
Tree {
root: root.clone(),
nodes: vec![root],
rng: SmallRng::seed_from_u64(42),
}
}
/// Add a new node as a child of a random node in the tree.
fn push_node(&mut self) {
let new_node = Node::new_rc_refcell();
let n_nodes = self.nodes.len();
// Bias the nodes towards the end so that there is more nesting.
let parent = &mut self.nodes[self.rng.gen_range((3 * n_nodes / 4)..n_nodes)];
(**parent).borrow_mut().push_child(new_node.clone());
self.nodes.push(new_node);
}
/// Write the YAML representation of the tree to `writer`.
fn write_to<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
(*self.root).borrow().write_to(writer, 0)
}
}
impl Node {
    /// Create a new node without children.
    fn new() -> Self {
        Node {
            children: Vec::new(),
        }
    }

    /// Create a new node without children, wrapped for shared mutable ownership.
    fn new_rc_refcell() -> Rc<RefCell<Self>> {
        let node = Self::new();
        Rc::new(RefCell::new(node))
    }

    /// Append a child to the node.
    fn push_child(&mut self, child: Rc<RefCell<Self>>) {
        self.children.push(child);
    }

    /// Write the YAML representation of the node to `writer`.
    ///
    /// A node without children is written as the single entry `a: 1`. Otherwise, each child is
    /// written as a key derived from its position, followed by the child itself indented by 2
    /// more columns.
    fn write_to<W: std::io::Write>(&self, writer: &mut W, indent: usize) -> std::io::Result<()> {
        if self.children.is_empty() {
            write_n(writer, ' ', indent)?;
            return writer.write_all(b"a: 1\n");
        }

        for (n, child) in self.children.iter().enumerate() {
            write_n(writer, ' ', indent)?;
            write_id_for_number(writer, n)?;
            writer.write_all(b":\n")?;
            let child: &RefCell<Node> = child;
            child.borrow().write_to(writer, indent + 2)?;
        }
        Ok(())
    }
}
/// Write the character `c` to `out`, `n` times.
///
/// Stops at, and returns, the first write error.
fn write_n<W: std::io::Write>(out: &mut W, c: char, n: usize) -> std::io::Result<()> {
    (0..n).try_for_each(|_| write!(out, "{c}"))
}
/// Write an identifier for the given number to `out`.
///
/// The identifier is `n + 1` written in base 27 with the digits `_` and `a` to `z`, least
/// significant digit first.
fn write_id_for_number<W: std::io::Write>(out: &mut W, mut n: usize) -> std::io::Result<()> {
    const DIGITS: &[u8] = b"_abcdefghijklmnopqrstuvwxyz";
    let base = DIGITS.len();

    // Offset by one so that at least one digit is written for `0`.
    n += 1;
    loop {
        if n == 0 {
            return Ok(());
        }
        let symbol = char::from(DIGITS[n % base]);
        write!(out, "{}", symbol)?;
        n /= base;
    }
}

68
bench/tools/run_bench.rs Normal file
View file

@ -0,0 +1,68 @@
#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
use std::{env, fs::File, io::prelude::*};
use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser};
/// A sink which discards any event sent.
///
/// It holds no state.
struct NullSink {}
impl MarkedEventReceiver for NullSink {
/// Ignore the event and its marker.
fn on_event(&mut self, _: Event, _: Marker) {}
}
/// Parse the given input, returning elapsed time in nanoseconds.
///
/// Only the call to the parser's `load` is timed; creating the parser and the sink is not.
///
/// # Panics
/// Panics if the parser reports an error.
fn do_parse(input: &str) -> u64 {
    let mut parser = Parser::new_from_str(input);
    let mut sink = NullSink {};

    let start = std::time::Instant::now();
    parser.load(&mut sink, true).unwrap();
    let stop = std::time::Instant::now();

    let elapsed = stop - start;
    elapsed.as_nanos() as u64
}
/// Benchmark the parser on a file.
///
/// Usage: `run_bench <file.yaml> <iterations> [--output-yaml]`.
///
/// The file is loaded in memory, parsed 3 times as a warmup and then parsed `<iterations>`
/// times. The average, minimum, maximum and 95th percentile times are printed, either as YAML
/// with times in nanoseconds (with `--output-yaml` as the third argument) or in a
/// human-readable form with times in seconds.
///
/// Missing arguments, or an iteration count that is not a positive integer, are reported on
/// standard error and the process exits with status 1.
///
/// # Panics
/// Panics if the file cannot be read or if the parser reports an error.
fn main() {
    let args: Vec<_> = env::args().collect();
    if args.len() < 3 {
        eprintln!("Usage: run_bench <file.yaml> <iterations> [--output-yaml]");
        std::process::exit(1);
    }
    // A count of 0 would leave no sample to compute the metrics from.
    let iterations = match args[2].parse::<u64>() {
        Ok(n) if n > 0 => n,
        _ => {
            eprintln!("<iterations> must be a positive integer (got `{}`)", args[2]);
            std::process::exit(1);
        }
    };
    let output_yaml = args.len() == 4 && args[3] == "--output-yaml";

    let mut f = File::open(&args[1]).unwrap();
    let mut s = String::new();
    f.read_to_string(&mut s).unwrap();

    // Warmup
    do_parse(&s);
    do_parse(&s);
    do_parse(&s);

    // Bench
    let times: Vec<_> = (0..iterations).map(|_| do_parse(&s)).collect();
    let mut sorted_times = times.clone();
    sorted_times.sort_unstable();

    // Compute relevant metrics.
    let sum: u64 = times.iter().sum();
    let avg = sum / iterations;
    let min = sorted_times[0];
    let max = sorted_times[(iterations - 1) as usize];
    // `95 * iterations / 100` is always strictly less than `iterations`.
    let percentile95 = sorted_times[((95 * iterations) / 100) as usize];

    if output_yaml {
        println!("parser: yaml-rust2");
        println!("input: {}", args[1]);
        println!("average: {avg}");
        println!("min: {min}");
        println!("max: {max}");
        println!("percentile95: {percentile95}");
        println!("iterations: {iterations}");
        println!("times:");
        // Times are listed in the order the runs were made.
        for time in &times {
            println!(" - {time}");
        }
    } else {
        println!("Average: {}s", (avg as f64) / 1_000_000_000.0);
        println!("Min: {}s", (min as f64) / 1_000_000_000.0);
        println!("Max: {}s", (max as f64) / 1_000_000_000.0);
        println!("95%: {}s", (percentile95 as f64) / 1_000_000_000.0);
    }
}

33
bench/tools/time_parse.rs Normal file
View file

@ -0,0 +1,33 @@
use std::env;
use std::fs::File;
use std::io::prelude::*;
use saphyr_parser::{Event, MarkedEventReceiver, Marker, Parser};
/// A sink which discards any event sent.
///
/// It holds no state.
struct NullSink {}
impl MarkedEventReceiver for NullSink {
/// Ignore the event and its marker.
fn on_event(&mut self, _: Event, _: Marker) {}
}
/// Time how long the parser takes to parse a file.
///
/// Usage: `time_parse <file.yaml> [--short]`.
///
/// The file is loaded in memory before the clock starts; only the call to the parser's `load`
/// is timed. With `--short` as the second argument, only the elapsed time in nanoseconds is
/// printed. Otherwise, a human-readable line is printed.
///
/// A missing file argument is reported on standard error and the process exits with status 1.
///
/// # Panics
/// Panics if the file cannot be read or if the parser reports an error.
fn main() {
    let args: Vec<_> = env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: time_parse <file.yaml> [--short]");
        std::process::exit(1);
    }

    let mut f = File::open(&args[1]).unwrap();
    let mut s = String::new();
    f.read_to_string(&mut s).unwrap();

    let mut sink = NullSink {};
    let mut parser = Parser::new_from_str(&s);

    // Load events using our sink as the receiver.
    let begin = std::time::Instant::now();
    parser.load(&mut sink, true).unwrap();
    let end = std::time::Instant::now();

    if args.len() == 3 && args[2] == "--short" {
        println!("{}", (end - begin).as_nanos());
    } else {
        println!("Loaded {}MiB in {:?}", s.len() / 1024 / 1024, end - begin);
    }
}

@ -1 +0,0 @@
Subproject commit 45db50aecf9b1520f8258938c88f396e96f30831

View file

@ -0,0 +1,3 @@
[alias]
gen_large_yaml = "run --profile=release-lto --package gen_large_yaml --bin gen_large_yaml --manifest-path tools/gen_large_yaml/Cargo.toml --"
bench_compare = "run --package bench_compare --bin bench_compare --manifest-path tools/bench_compare/Cargo.toml --"

1
saphyr/.gitattributes vendored Normal file
View file

@ -0,0 +1 @@
tests/*.rs.inc linguist-language=Rust linguist-generated

39
saphyr/.github/workflows/ci.yml vendored Normal file
View file

@ -0,0 +1,39 @@
name: CI
on:
pull_request:
push:
branches:
- master
jobs:
check:
name: Lints and checks
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- run: rustup toolchain install stable --profile minimal --component rustfmt --component clippy --no-self-update
- uses: Swatinem/rust-cache@v2
- name: Run clippy checks
run: cargo clippy --all-targets -- -D warnings
- name: Run format checks
run: cargo fmt --check
test:
name: Test using Rust ${{ matrix.rust }} on ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
rust: [stable]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout
uses: actions/checkout@v3
- run: rustup toolchain install ${{ matrix.rust }} --profile minimal --no-self-update
- uses: Swatinem/rust-cache@v2
- name: Run build
run: cargo build
- name: Run tests
run: cargo test -v

5
saphyr/.gitignore vendored Normal file
View file

@ -0,0 +1,5 @@
target
Cargo.lock
*.swp
/perf.*
/coverage.sh

View file

@ -0,0 +1,191 @@
Copyright (c) 2015 Chen Yuheng
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

View file

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Chen Yuheng
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,191 @@
Copyright (c) 2023 Ethiraric
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS

View file

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2023 Ethiraric
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

125
saphyr/CHANGELOG.md Normal file
View file

@ -0,0 +1,125 @@
# Changelog
## Upcoming
**Breaking Changes**:
- Move `load_from_*` methods out of the `YamlLoader`. Now, `YamlLoader` gained
a generic parameter. Moving those functions out of it spares having to
manually specify the generic in `YamlLoader::<Yaml>::load_from_str`.
Manipulating the `YamlLoader` directly was not common.
- Make `LoadError` `Clone` by storing an `Arc<std::io::Error>` instead of the
error directly.
**Features**:
- ([#19](https://github.com/Ethiraric/yaml-rust2/pull/19)) `Yaml` now
implements `IndexMut<usize>` and `IndexMut<&'a str>`. These functions may not
return a mutable reference to a `BAD_VALUE`. Instead, `index_mut()` will
panic if either:
* The index is out of range, as per `IndexMut`'s requirements
* The inner `Yaml` variant doesn't match `Yaml::Array` for `usize` or
`Yaml::Hash` for `&'a str`
- Use cargo features
This allows for more fine-grained control over MSRV and to completely remove
debug code from the library when it is consumed.
The `encoding` feature, governing the `YamlDecoder`, has been enabled by
default. Users of `@davvid`'s fork of `yaml-rust` or of `yaml-rust2` might
already use this. Users of the original `yaml-rust` crate may freely disable
this feature (`cargo <...> --no-default-features`) and lower MSRV to 1.65.0.
- Load with metadata
The `YamlLoader` now supports adding metadata alongside the nodes. For now,
the only one supported is the `Marker`, pointing to the position in the input
stream of the start of the node.
This feature is extensible and should allow (later) to add comments.
## v0.8.0
**Breaking Changes**:
- The `encoding` library has been replaced with `encoding_rs`. If you use the
`trap` of `YamlDecoder`, this change will make your code not compile.
An additional enum `YamlDecoderTrap` has been added to abstract the
underlying library and avoid breaking changes in the future. This
additionally lifts the `encoding` dependency on _your_ project if you were
using that feature.
- The signature of the function for `YamlDecoderTrap::Call` has changed:
- The `encoding::types::DecoderTrap` has been replaced with `YamlDecoderTrap`.
```rust
// Before, with `encoding::types::DecoderTrap::Call`
fn(_: &mut encoding::RawDecoder, _: &[u8], _: &mut encoding::StringWriter) -> bool;
// Now, with `YamlDecoderTrap::Call`
fn(_: u8, _: u8, _: &[u8], _: &mut String) -> ControlFlow<Cow<'static str>>;
```
Please refer to the `YamlDecoderTrapFn` documentation for more details.
**Features**:
- Tags can now be retained across documents by calling `keep_tags(true)` on a
`Parser` before loading documents.
([#10](https://github.com/Ethiraric/yaml-rust2/issues/10))
([#12](https://github.com/Ethiraric/yaml-rust2/pull/12))
- `YamlLoader` structs now have a `documents()` method that returns the parsed
documents associated with a loader.
- `Parser::new_from_str(&str)` and `YamlLoader::load_from_parser(&Parser)` were added.
**Development**:
- Linguist attributes were added for the `tests/*.rs.inc` files to prevent GitHub from
classifying them as C++ files.
## v0.7.0
**Features**:
- Multi-line strings are now
[emitted using block scalars](https://github.com/chyh1990/yaml-rust/pull/136).
- Error messages now contain a byte offset to aid debugging.
([#176](https://github.com/chyh1990/yaml-rust/pull/176))
- Yaml now has `or` and `borrowed_or` methods.
([#179](https://github.com/chyh1990/yaml-rust/pull/179))
- `Yaml::load_from_bytes()` is now available.
([#156](https://github.com/chyh1990/yaml-rust/pull/156))
- The parser and scanner now return Err() instead of calling panic.
**Development**:
- The documentation was updated to include a security note mentioning that
yaml-rust is safe because it does not interpret types.
([#195](https://github.com/chyh1990/yaml-rust/pull/195))
- Updated to quickcheck 1.0.
([#188](https://github.com/chyh1990/yaml-rust/pull/188))
- `hashlink` is [now used](https://github.com/chyh1990/yaml-rust/pull/157)
instead of `linked_hash_map`.
## v0.6.0
**Development**:
- `is_xxx` functions were moved into the private `char_traits` module.
- Benchmarking tools were added.
- Performance was improved.
## v0.5.0
- The parser now supports tag directives.
([#35](https://github.com/chyh1990/yaml-rust/issues/35))
- The `info` field has been exposed via a new `Yaml::info()` API method.
([#190](https://github.com/chyh1990/yaml-rust/pull/190))

34
saphyr/Cargo.toml Normal file
View file

@ -0,0 +1,34 @@
[package]
name = "saphyr"
version = "0.0.1"
authors = [
"Yuheng Chen <yuhengchen@sensetime.com>",
"Ethiraric <ethiraric@gmail.com>",
"David Aguilar <davvid@gmail.com>"
]
documentation = "https://docs.rs/saphyr"
keywords = [ "yaml", "parser" ]
categories = [ "encoding", "parser-implementations" ]
license = "MIT OR Apache-2.0"
description = "A fully YAML 1.2 compliant YAML library"
repository = "https://github.com/saphyr-rs/saphyr"
readme = "README.md"
edition = "2021"
rust-version = "1.70.0"
[features]
default = [ "encoding" ]
encoding = [ "dep:encoding_rs" ]
[dependencies]
arraydeque = "0.5.1"
saphyr-parser = "0.0.2"
encoding_rs = { version = "0.8.33", optional = true }
hashlink = "0.8"
[dev-dependencies]
quickcheck = "1.0"
[profile.release-lto]
inherits = "release"
lto = true

9
saphyr/LICENSE Normal file
View file

@ -0,0 +1,9 @@
Code up to and including commit `da52a68615f2ecdd6b7e4567019f280c433c1521` is licensed by Chen Yuheng under either of:
- [Apache License, Version 2.0](.licenses/ChenYuheng-Apache) (http://www.apache.org/licenses/LICENSE-2.0)
- [MIT License](.licenses/ChenYuheng-MIT) (http://opensource.org/licenses/MIT)
Code modifications starting with commit `1d71a23b151dcc12b289d0f06d8207dd9c764216` (included) are licensed by Ethiraric under either of:
- [Apache License, Version 2.0](.licenses/Ethiraric-Apache) (http://www.apache.org/licenses/LICENSE-2.0)
- [MIT License](.licenses/Ethiraric-MIT) (http://opensource.org/licenses/MIT)
Redistributions of this Work must include licenses of both Chen Yuheng and Ethiraric.

135
saphyr/README.md Normal file
View file

@ -0,0 +1,135 @@
# saphyr
[saphyr](https://github.com/saphyr-rs/saphyr) is a fully compliant YAML 1.2
library written in pure Rust.
This work is based on [`yaml-rust`](https://github.com/chyh1990/yaml-rust) with
fixes towards being compliant with the [YAML test
suite](https://github.com/yaml/yaml-test-suite/). `yaml-rust`'s parser is
heavily influenced by `libyaml` and `yaml-cpp`.
`saphyr` is a pure Rust YAML 1.2 implementation that benefits from the
memory safety and other guarantees of the Rust language.
## Quick Start
### Installing
Add the following to your Cargo.toml:
```toml
[dependencies]
saphyr = "0.0.1"
```
or use `cargo add` to get the latest version automatically:
```sh
cargo add saphyr
```
### Example
Use `saphyr::Yaml::load_from_str` to load YAML documents and access them as `Yaml` objects:
```rust
use saphyr::{Yaml, YamlEmitter};
fn main() {
let s =
"
foo:
- list1
- list2
bar:
- 1
- 2.0
";
let docs = Yaml::load_from_str(s).unwrap();
// Multi document support, doc is a saphyr::Yaml
let doc = &docs[0];
// Debug support
println!("{:?}", doc);
// Index access for map & array
assert_eq!(doc["foo"][0].as_str().unwrap(), "list1");
assert_eq!(doc["bar"][1].as_f64().unwrap(), 2.0);
// Array/map-like accesses are checked and won't panic.
// They will return `BadValue` if the access is invalid.
assert!(doc["INVALID_KEY"][100].is_badvalue());
// Dump the YAML object
let mut out_str = String::new();
{
let mut emitter = YamlEmitter::new(&mut out_str);
emitter.dump(doc).unwrap(); // dump the YAML object to a String
}
println!("{}", out_str);
}
```
Note that `saphyr::Yaml` implements `Index<&'a str>` and `Index<usize>`:
* `Index<usize>` assumes the container is an array
* `Index<&'a str>` assumes the container is a string to value map
* otherwise, `Yaml::BadValue` is returned
Note that `annotated::YamlData` cannot return `BadValue` and will panic.
If your document does not conform to this convention (e.g. map with complex
type key), you can use the `Yaml::as_XXX` family of functions to access
your objects.
## Features
* Pure Rust
* `Vec`/`HashMap` access API
## Security
This library does not try to interpret any type specifiers in a YAML document,
so there is no risk of, say, instantiating a socket with fields and
communicating with the outside world just by parsing a YAML document.
## Specification Compliance
This implementation is fully compatible with the YAML 1.2 specification. The
parser behind this library
([`saphyr-parser`](https://github.com/saphyr-rs/saphyr-parser)) tests against
(and passes) the [YAML test suite](https://github.com/yaml/yaml-test-suite/).
## License
Licensed under either of
* Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
* MIT license (http://opensource.org/licenses/MIT)
at your option.
Since this repository was originally maintained by
[chyh1990](https://github.com/chyh1990), there are 2 sets of licenses.
A license of each set must be included in redistributions. See the
[LICENSE](LICENSE) file for more details.
You can find the licenses in the [`.licenses`](.licenses) subfolder.
## Contribution
[Fork this repository](https://github.com/saphyr-rs/saphyr/fork) and
[Create a Pull Request on GitHub](https://github.com/saphyr-rs/saphyr/compare/master...saphyr-rs:saphyr:master).
You may need to click on "compare across forks" and select your fork's branch.
Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in the work by you, as defined in the Apache-2.0 license, shall
be dual licensed as above, without any additional terms or conditions.
## Links
* [saphyr source code repository](https://github.com/saphyr-rs/saphyr)
* [saphyr releases on crates.io](https://crates.io/crates/saphyr)
* [saphyr documentation on docs.rs](https://docs.rs/saphyr/latest/saphyr/)
* [saphyr-parser releases on crates.io](https://crates.io/crates/saphyr-parser)
* [yaml-test-suite](https://github.com/yaml/yaml-test-suite)

65
saphyr/appveyor.yml Normal file
View file

@ -0,0 +1,65 @@
clone_depth: 1
branches:
only:
- master
environment:
LLVM_VERSION: 9.0.1
PLATFORM: x64
matrix:
- channel: stable
target: i686-pc-windows-msvc
type: msvc
- channel: stable
target: x86_64-pc-windows-msvc
type: msvc
- channel: stable
target: i686-pc-windows-gnu
type: gnu
- channel: stable
target: x86_64-pc-windows-gnu
type: gnu
- channel: nightly
target: i686-pc-windows-msvc
type: msvc
- channel: nightly
target: x86_64-pc-windows-msvc
type: msvc
- channel: nightly
target: i686-pc-windows-gnu
type: gnu
- channel: nightly
target: x86_64-pc-windows-gnu
type: gnu
install:
- if %PLATFORM% == x86 (set RUST_PLATFORM=i686&set MINGW_BITS=32) else (set RUST_PLATFORM=x86_64&set MINGW_BITS=64)
- ps: >-
If ($env:target -eq 'x86_64-pc-windows-gnu') {
$env:PATH += ';C:\msys64\mingw64\bin'
} ElseIf ($env:target -eq 'i686-pc-windows-gnu') {
$env:PATH += ';C:\msys64\mingw32\bin'
}
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
- rustup-init -yv --default-toolchain %channel% --default-host %target%
- set PATH=%PATH%;%USERPROFILE%\.cargo\bin
- rustc -vV
- cargo -vV
# Install LLVM for GNU
- if %type%==gnu set PATH=C:\msys64\mingw%MINGW_BITS%\bin;C:\msys64\usr\bin;%PATH%
- if %type%==gnu set "MINGW_URL=http://repo.msys2.org/mingw/%RUST_PLATFORM%/mingw-w64-%RUST_PLATFORM%"
- if %type%==gnu set "URL_VER=%LLVM_VERSION%-1-any.pkg.tar.xz"
- if %type%==gnu bash -lc "pacman -U --noconfirm $MINGW_URL-clang-$URL_VER $MINGW_URL-llvm-$URL_VER"
- if %type%==gnu bash -lc "clang --version"
# Use preinstalled LLVM for MSVC
- if %type%==msvc set PATH=%PATH%;C:\Program Files\LLVM\bin
- if %type%==msvc where clang
- if %type%==msvc clang --version
build_script:
- cargo build -vv
test_script:
- cargo test -vv
deploy: off

View file

@ -0,0 +1,167 @@
# `yaml-rust2`'s first real release
If you are not interested in how this crate was born and just want to know what differs from `yaml-rust`, scroll down to
["This release" or click here](#this-release).
## The why
Sometime in August 2023, an ordinary developer (that's me) felt the urge to start scribbling about an OpenAPI linter. I
had worked with the OpenAPI format and tried different linters, but none of them felt right. And me needing 3 different
linters to lint my OpenAPI was a pain to me. Like any sane person would do, I would write my own (author's note: you are
not not sane if you wouldn't). In order to get things started, I needed a YAML parser.
On August 14th 2023, I forked `yaml-rust` and started working on it. The crate stated that some YAML features were not
yet available and I felt that was an issue I could tackle. I started by getting to know the code, understanding it,
adding warnings, refactoring, tinkering, documenting, ... . Anything I could do that made me feel that codebase was
better, I would do it. I wanted this crate to be as clean as it could be.
## Fixing YAML compliance
In my quest to understand YAML better, I found [the YAML test suite](https://github.com/yaml/yaml-test-suite/): a
compilation of corner cases and intricate YAML examples with their expected output / behavior. Interestingly enough,
there was an [open pull request on yaml-rust](https://github.com/chyh1990/yaml-rust/pull/187) by
[tanriol](https://github.com/tanriol) which integrated the YAML test suite as part of the crate tests. Comments mention
that the maintainer wasn't around anymore and that new contributions would probably never be accepted.
That, however, was a problem for future-past-me, as I was determined (somehow) to have `yaml-rust` pass every single
test of the YAML test suite. Slowly, over the course of multiple months (from August 2023 to January 2024), I would
sometimes pick a test from the test suite, fix it, commit and start again. On the 23rd of January, the last commit
fixing a test was created.
According to the [YAML test matrix](https://matrix.yaml.info/), there is to this day only 1 library that is fully
compliant (aside from the Perl parser generated by the reference). This would make `yaml-rust2` the second library to be
fully YAML-compliant. You really wouldn't believe how much you have to stretch YAML so that it's not valid YAML anymore.
## Performance
With so many improvements, the crate was now perfect!.. Except for performance. Adding conditions for every little bit
of compliance has led the code to be much more complex and branch-y, which CPUs hate. I was around 20% slower than the
code was when I started.
For a bit over 3 weeks, I stared at flamegraphs and made my CPU repeat the same instructions until it could do it
faster. There have been a bunch of improvements for performance since `yaml-rust`'s last commit. Here are a few of them:
* Avoid putting characters in a `VecDeque<char>` buffer when we can push them directly into a `String`.
* Be a bit smarter about reallocating temporaries: it's best if we know the size in advance, but when we don't we can
sometimes avoid pushing characters 1 at a time.
* The scanner skips over characters one at a time. When skipping them, it needs to check whether they're a linebreak to
update the location. Sometimes, we know we skip over a letter (which is not a linebreak). Several "skip" functions
have been added for specific uses.
And the big winner, with a decrease in runtime of around 15%, was using a statically-sized buffer instead of a dynamically
allocated one. (Almost) Every character goes from the input stream into the buffer and then gets read from the buffer.
This means that `VecDeque::push` and `VecDeque::pop` were called very frequently. The former always has to check for
capacity. Using an `ArrayDeque` removed the need for constant capacity checks, at the cost of a minor decrease in
performance if a line is deeply indented. Hopefully, nobody has 42 nested YAML objects.
Here is in the end the performance breakdown:
![Comparison of the performance between `yaml-rust`, `yaml-rust2` and the C `libfyaml`. `yaml-rust2` is faster in every
test than `yaml-rust`, but `libfyaml` remains faster overall.](./img/benchmarks-v0.6.svg)
Here is a short description of what the files contain:
* `big`: A large array of records with few fields. One of the fields is a description, a large text block scalar
spanning multiple lines. Most of the scanning happens in block scalars.
* `nested`: Very short key-value pairs that nest deeply.
* `small_objects`: A large array of 2 key-value mappings.
* `strings_array`: A large array of lipsum one-liners (~150-175 characters in length).
As you can see, `yaml-rust2` performs better than `yaml-rust` on every benchmark. However, when compared against the C
[`libfyaml`](https://github.com/pantoniou/libfyaml), we can see that there is still much room for improvement.
I'd like to end this section with a small disclaimer: I am not a benchmark expert. I tried to have a heterogeneous set
of files that would highlight how the parser performs when stressed in different ways. I invite you to take a look at [the
code generating the YAML files](https://github.com/Ethiraric/yaml-rust2/tree/master/tools/gen_large_yaml) and, if you
are more knowledgeable than I am, improve upon them. `yaml-rust2` performs better with these files because those are the
ones I could work with. If you find a file with which `yaml-rust2` is slower than `yaml-rust`, do file an issue!
## This release
### Improvements from `yaml-rust`
This release should improve over `yaml-rust` on 3 major points:
* Performance: We all love fast software. I want to help you achieve it. I haven't managed to make this crate twice as
fast, but you should notice a 15-20% improvement in performance.
* Compliance: You may not notice it, since I didn't know most of the bugs I fixed were bugs to begin with, but this
crate should now be fully YAML-compliant.
* Documentation: The documentation of `yaml-rust` is unfortunately incomplete. Documentation here is not exhaustive,
but most items are documented. Notably, private items are documented, making it much easier to understand where
something happens. There are also in-code comments that help figure out what is going on under the hood.
Also, last but not least, I do plan on keeping this crate alive as long as I can. Nobody can make promises in that
regard, of course, but I have poured hours of work into this, and I would hate to see this go to waste.
### Switching to `yaml-rust2`
This release is `v0.6.0`, chosen to explicitly differ in minor from `yaml-rust`. `v0.4.x` does not exist in this crate
to avoid any confusion between the 2 crates.
Switching to `yaml-rust2` should be a very simple process. Change your `Cargo.toml` to use `yaml-rust2` instead of
`yaml-rust`:
```diff
-yaml-rust = "0.4.4"
+yaml-rust2 = "0.8.0"
```
As for your code, you have one of two solutions:
* Changing your imports from `use yaml_rust::Yaml` to `use yaml_rust2::Yaml` if you import items directly, or change
occurrences of `yaml_rust` to `yaml_rust2` if you use fully qualified paths.
* Alternatively, you can alias `yaml_rust2` with `use yaml_rust2 as yaml_rust`. This would keep your code working if
you use fully qualified paths.
Whichever you decide is up to you.
[Courtesy of davvid](https://github.com/chyh1990/yaml-rust/issues/160#issuecomment-2008931473), there is another
solution. You can combine both approaches and tell `Cargo.toml` to add `yaml-rust2` and to create a `yaml_rust` alias
for your code with the following:
```diff
-yaml-rust = "0.4.4"
+yaml-rust = { version = "0.6", package = "yaml-rust2" }
```
This allows you to switch to `yaml-rust2` while continuing to refer to `yaml_rust` in your code (e.g.
`use yaml_rust::YamlLoader;` will continue to work so that no Rust code changes are required).
#### What about API breakage?
Most of what I have changed is in the implementation details. You might notice more documentation appearing on your LSP,
but documentation isn't bound by the API. There is only one change I made that could lead to compile errors. It is
unlikely you used that feature, but I'd hate to leave this undocumented.
If you use the low-level event parsing API (`Parser`,
`EventReceiver` / `MarkedEventReceiver`) and namely the `yaml_rust::Event` enumeration, there is one change that might
break your code. This was needed for tests in the YAML test suite. In `yaml-rust`, YAML tags are not forwarded from the
lower-level `Scanner` API to the low-level `Parser` API.
Here is the change that was made in the library:
```diff
pub enum Event {
// ...
-SequenceStart(usize),
-MappingStart(usize),
+SequenceStart(usize, Option<Tag>),
+MappingStart(usize, Option<Tag>),
// ...
}
```
This means that you may now see YAML tags appearing in your code.
## Closing words
YAML is hard. Much more than I had anticipated. If you are exploring dark corners of YAML that `yaml-rust2` supports but
`yaml-rust` doesn't, I'm curious to know what it is.
Work on this crate is far from over. I will try and match `libfyaml`'s performance. Today is the first time I benched
against it, and I wouldn't have guessed it to outperform `yaml-rust2` that much.
If you're interested in upgrading your `yaml-rust` crate, please do take a look at [davvid](https://github.com/davvid)'s
[fork of `yaml-rust`](https://github.com/davvid/yaml-rust). Very recent developments on this crate sparked from an
[issue on advisory-db](https://github.com/rustsec/advisory-db/issues/1921) about the unmaintained state of `yaml-rust`.
I hope that YAML in Rust will improve following this issue.
Thank you for reading through this. If you happen to have issues with `yaml-rust2` or suggestions, do [drop an
issue](https://github.com/Ethiraric/yaml-rust2/issues)!
If however you wanted an OpenAPI linter, I'm afraid you're out of luck. Just as much as I'm out of time ;)
-Ethiraric
EDIT(20-03-2024): Add davvid's method of switching to `yaml-rust2` by creating a Cargo alias.

View file

@ -0,0 +1,5 @@
,yaml-rust2,yaml-rust,libfyaml
big.yaml,1644933464,2097747837,1642761913
nested.yaml,1186706803,1461738560,1104480120
small_objects.yaml,5459915062,5686715239,4402878726
strings_array.yaml,1698194153,2044921291,924246153
1 yaml-rust2 yaml-rust libfyaml
2 big.yaml 1644933464 2097747837 1642761913
3 nested.yaml 1186706803 1461738560 1104480120
4 small_objects.yaml 5459915062 5686715239 4402878726
5 strings_array.yaml 1698194153 2044921291 924246153

View file

@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg width="177mm" height="92mm" viewBox="0 0 17700 9200" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" stroke-width="28.222" stroke-linejoin="round" xml:space="preserve">
<path fill="rgb(255,255,255)" stroke="none" d="M 8856,9178 L -13,9178 -13,-13 17724,-13 17724,9178 8856,9178 Z"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 8976,8348 L 2926,8348 2926,370 15027,370 15027,8348 8976,8348 Z"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 15027,8347 L 2926,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 15027,7017 L 2926,7017"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 15027,5687 L 2926,5687"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 15027,4358 L 2926,4358"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 15027,3028 L 2926,3028"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 15027,1698 L 2926,1698"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 15027,368 L 2926,368"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2926,8497 L 2926,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2926,8497 L 2926,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 5951,8497 L 5951,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 5951,8497 L 5951,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 8976,8497 L 8976,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 8976,8497 L 8976,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 12001,8497 L 12001,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 12001,8497 L 12001,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 15027,8497 L 15027,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 15027,8497 L 15027,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2926,8347 L 15027,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,8347 L 2926,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,8347 L 2926,8347"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,7017 L 2926,7017"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,7017 L 2926,7017"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,5687 L 2926,5687"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,5687 L 2926,5687"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,4358 L 2926,4358"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,4358 L 2926,4358"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,3028 L 2926,3028"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,3028 L 2926,3028"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,1698 L 2926,1698"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,1698 L 2926,1698"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,368 L 2926,368"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2776,368 L 2926,368"/>
<path fill="none" stroke="rgb(179,179,179)" stroke-linejoin="round" d="M 2926,8347 L 2926,368"/>
<path fill="rgb(248,203,173)" stroke="none" d="M 3304,8347 L 4060,8347 4060,5557 3304,5557 3304,8347 Z"/>
<path fill="rgb(248,203,173)" stroke="none" d="M 6329,8347 L 7086,8347 7086,6403 6329,6403 6329,8347 Z"/>
<path fill="rgb(248,203,173)" stroke="none" d="M 9354,8347 L 10110,8347 10110,785 9354,785 9354,8347 Z"/>
<path fill="rgb(248,203,173)" stroke="none" d="M 12379,8347 L 13136,8347 13136,5627 12379,5627 12379,8347 Z"/>
<path fill="rgb(198,224,180)" stroke="none" d="M 4060,8347 L 4816,8347 4816,6159 4060,6159 4060,8347 Z"/>
<path fill="rgb(198,224,180)" stroke="none" d="M 7086,8347 L 7842,8347 7842,6768 7086,6768 7086,8347 Z"/>
<path fill="rgb(198,224,180)" stroke="none" d="M 10110,8347 L 10866,8347 10866,1087 10110,1087 10110,8347 Z"/>
<path fill="rgb(198,224,180)" stroke="none" d="M 13136,8347 L 13892,8347 13892,6088 13136,6088 13136,8347 Z"/>
<path fill="rgb(189,215,238)" stroke="none" d="M 4816,8347 L 5573,8347 5573,6162 4816,6162 4816,8347 Z"/>
<path fill="rgb(189,215,238)" stroke="none" d="M 7842,8347 L 8598,8347 8598,6878 7842,6878 7842,8347 Z"/>
<path fill="rgb(189,215,238)" stroke="none" d="M 10866,8347 L 11623,8347 11623,2492 10866,2492 10866,8347 Z"/>
<path fill="rgb(189,215,238)" stroke="none" d="M 13892,8347 L 14648,8347 14648,7117 13892,7117 13892,8347 Z"/>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="4213" y="8915"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">big</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="6948" y="8915"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">nested</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="9456" y="8915"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">small_objects</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="12522" y="8915"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">strings_array</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="2491" y="8467"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">0</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="1353" y="7137"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">1000000</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="1353" y="5807"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">2000000</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="1353" y="4478"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">3000000</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="1353" y="3148"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">4000000</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="1353" y="1818"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">5000000</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="1353" y="488"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">6000000</tspan></tspan></tspan></text>
<path fill="rgb(248,203,173)" stroke="none" d="M 15603,4190 L 15497,4190 15497,3979 15708,3979 15708,4190 15603,4190 Z"/>
<path fill="rgb(198,224,180)" stroke="none" d="M 15603,4687 L 15497,4687 15497,4477 15708,4477 15708,4687 15603,4687 Z"/>
<path fill="rgb(189,215,238)" stroke="none" d="M 15603,5185 L 15497,5185 15497,4974 15708,4974 15708,5185 15603,5185 Z"/>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="15808" y="4204"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">yaml-rust</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="15808" y="4701"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">yaml-rust2</tspan></tspan></tspan></text>
<text class="SVGTextShape"><tspan class="TextParagraph"><tspan class="TextPosition" x="15808" y="5199"><tspan font-family="Liberation Sans, sans-serif" font-size="353px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">libfyaml</tspan></tspan></tspan></text>
<text class="SVGTextShape" transform="rotate(-90 -15451 4894)"><tspan class="TextParagraph"><tspan class="TextPosition" x="824" y="6394"><tspan font-family="Liberation Sans, sans-serif" font-size="318px" font-weight="400" fill="rgb(0,0,0)" stroke="none" style="white-space: pre">Time in ms (less is better)</tspan></tspan></tspan></text>
</svg>

After

Width:  |  Height:  |  Size: 9.7 KiB

View file

@ -0,0 +1,44 @@
use saphyr::Yaml;
use std::env;
use std::fs::File;
use std::io::prelude::*;
/// Write the indentation unit `indent` times to standard output, without a trailing newline.
///
/// Nothing is written when `indent` is zero.
fn print_indent(indent: usize) {
    // Build the whole prefix up front so that it is emitted with a single `print!` call.
    print!("{}", " ".repeat(indent));
}
/// Recursively print the `Debug` representation of `doc` to standard output.
///
/// * Sequences print nothing themselves; each element is dumped one level deeper.
/// * Mappings print each key at the current level followed by `:`, then dump the associated value
///   one level deeper.
/// * Any other node is printed on its own line at the current level.
fn dump_node(doc: &Yaml, indent: usize) {
    match doc {
        Yaml::Array(items) => items.iter().for_each(|item| dump_node(item, indent + 1)),
        Yaml::Hash(entries) => {
            for (k, value) in entries {
                print_indent(indent);
                println!("{k:?}:");
                dump_node(value, indent + 1);
            }
        }
        _ => {
            print_indent(indent);
            println!("{doc:?}");
        }
    }
}
/// Load the YAML file named by the first command-line argument and dump every document in it.
///
/// Each document is introduced by a `---` line and printed with `dump_node`.
///
/// When no file path is given, a usage message is printed on standard error and the process exits
/// with status 1 (instead of panicking on an out-of-bounds index).
///
/// # Panics
/// Panics if the file cannot be opened or read, or if its contents are not valid YAML.
fn main() {
    // Skip the program name; the first real argument is the path of the file to dump.
    let path = match env::args().nth(1) {
        Some(path) => path,
        None => {
            eprintln!("usage: pass the path of the YAML file to dump as the first argument");
            std::process::exit(1);
        }
    };

    let mut s = String::new();
    File::open(&path)
        .and_then(|mut f| f.read_to_string(&mut s))
        .unwrap_or_else(|err| panic!("failed to read {path}: {err}"));

    let docs = Yaml::load_from_str(&s).unwrap();
    for doc in &docs {
        println!("---");
        dump_node(doc, 0);
    }
}

40
saphyr/garden.yaml Normal file
View file

@ -0,0 +1,40 @@
# Use "cargo install garden-tools" to install garden https://gitlab.com/garden-rs/garden
#
# usage:
# garden build
# garden test
# garden check
# garden fmt
# garden fix
commands:
build: |
cargo build --all-targets --release
cargo build --all-targets
check>:
- check/clippy
- check/fmt
- build
- test
- doc
check/clippy: |
cargo clippy --all-targets --release -- -D warnings
cargo clippy --all-targets -- -D warnings
check/fmt: cargo fmt --check
doc: cargo doc --all-features
fix: cargo clippy --all-targets --fix -- -D warnings
fmt: cargo fmt
test: |
cargo test
cargo test --release
cargo test --doc
watch: cargo watch --shell "garden check"
environment:
RUSTDOCFLAGS: "-D warnings"
trees:
saphyr:
description: A pure Rust YAML implementation
path: ${GARDEN_CONFIG_DIR}
url: "git@github.com:saphyr-rs/saphyr.git"

10
saphyr/justfile Normal file
View file

@ -0,0 +1,10 @@
before_commit:
cargo fmt --check
cargo clippy --release --all-targets -- -D warnings
cargo clippy --all-targets -- -D warnings
cargo build --release --all-targets
cargo build --all-targets
cargo test
cargo test --release
cargo test --doc
RUSTDOCFLAGS="-D warnings" cargo doc --all-features

283
saphyr/src/annotated.rs Normal file
View file

@ -0,0 +1,283 @@
//! Utilities for extracting YAML with certain metadata.
pub mod marked_yaml;
use std::ops::{Index, IndexMut};
use hashlink::LinkedHashMap;
use crate::loader::parse_f64;
/// YAML data for nodes that will contain annotations.
///
/// If you want a YAML node without annotations, see [`Yaml`].
/// If you want a YAML node with annotations, see types using [`YamlData`] such as [`MarkedYaml`].
///
/// Unlike [`Yaml`] which only supports storing data, [`YamlData`] allows storing metadata
/// alongside the YAML data. It is unlikely one would build it directly; it is mostly intended to
/// be used, for instance, when parsing a YAML where retrieving markers / comments is relevant.
///
/// This definition is recursive. Each annotated node will be a structure storing the annotations
/// and the YAML data. We need to have a distinct enumeration from [`Yaml`] because the type for
/// the `Array` and `Hash` variants is dependent on that structure.
///
/// If we had written [`YamlData`] as:
/// ```ignore
/// pub enum YamlData {
///   // ...
///   Array(Vec<Yaml>),
///   Hash(LinkedHashMap<Yaml, Yaml>),
///   // ...
/// }
/// ```
/// we would have stored metadata for the root node only. All subsequent nodes would be [`Yaml`],
/// which does not contain any annotation.
///
/// The `Node` type parameter is the annotated node type stored in the `Array` and `Hash`
/// variants. It must be hashable and comparable (it is used as a mapping key) and constructible
/// from a `YamlData<Node>` (the indexing implementations build lookup keys that way).
///
/// Notable differences with [`Yaml`]:
///   * Indexing cannot return `BadValue` and will panic instead.
///
/// [`Yaml`]: crate::Yaml
/// [`MarkedYaml`]: marked_yaml::MarkedYaml
#[derive(Clone, PartialEq, PartialOrd, Debug, Eq, Ord, Hash)]
pub enum YamlData<Node>
where
Node: std::hash::Hash + std::cmp::Eq + From<Self>,
{
/// Float types are stored as String and parsed on demand.
/// Note that `f64` does NOT implement Eq trait and can NOT be stored in `BTreeMap`.
Real(String),
/// YAML int is stored as i64.
Integer(i64),
/// YAML scalar.
String(String),
/// YAML bool, e.g. `true` or `false`.
Boolean(bool),
/// YAML array, can be accessed as a `Vec`.
Array(AnnotatedArray<Node>),
/// YAML hash, can be accessed as a `LinkedHashMap`.
///
/// Iteration order will match the order of insertion into the map.
Hash(AnnotatedHash<Node>),
/// Alias, not fully supported yet.
Alias(usize),
/// YAML null, e.g. `null` or `~`.
Null,
/// Placeholder for an invalid or missing value.
///
/// Unlike with [`Yaml`](crate::Yaml), indexing a [`YamlData`] never yields this variant: the
/// `Index` / `IndexMut` implementations panic instead. Invalid type conversion also
/// returns `BadValue`.
BadValue,
}
/// The type contained in the [`YamlData::Array`] variant. This corresponds to YAML sequences.
///
/// `Node` is the annotated node type; elements keep their annotations.
// The `Annotated` prefix repeats the name of this module (`annotated`), hence the lint exemption.
#[allow(clippy::module_name_repetitions)]
pub type AnnotatedArray<Node> = Vec<Node>;
/// The type contained in the [`YamlData::Hash`] variant. This corresponds to YAML mappings.
///
/// Both keys and values are annotated `Node`s.
// The `Annotated` prefix repeats the name of this module (`annotated`), hence the lint exemption.
#[allow(clippy::module_name_repetitions)]
pub type AnnotatedHash<Node> = LinkedHashMap<Node, Node>;
impl<Node> YamlData<Node>
where
    Node: std::hash::Hash + std::cmp::Eq + From<Self>,
{
    define_as!(as_bool, bool, Boolean);
    define_as!(as_i64, i64, Integer);

    define_as_ref!(as_hash, &AnnotatedHash<Node>, Hash);
    define_as_ref!(as_str, &str, String);
    define_as_ref!(as_vec, &AnnotatedArray<Node>, Array);

    define_as_mut_ref!(as_mut_hash, &mut AnnotatedHash<Node>, Hash);
    define_as_mut_ref!(as_mut_vec, &mut AnnotatedArray<Node>, Array);

    define_into!(into_bool, bool, Boolean);
    define_into!(into_hash, AnnotatedHash<Node>, Hash);
    define_into!(into_i64, i64, Integer);
    define_into!(into_string, String, String);
    define_into!(into_vec, AnnotatedArray<Node>, Array);

    define_is!(is_alias, Self::Alias(_));
    define_is!(is_array, Self::Array(_));
    define_is!(is_badvalue, Self::BadValue);
    define_is!(is_boolean, Self::Boolean(_));
    define_is!(is_hash, Self::Hash(_));
    define_is!(is_integer, Self::Integer(_));
    define_is!(is_null, Self::Null);
    define_is!(is_real, Self::Real(_));
    define_is!(is_string, Self::String(_));

    /// Parse the text of this node as an `f64`.
    ///
    /// Returns `None` if the node is not a [`YamlData::Real`], or if its text is rejected by
    /// `parse_f64`.
    #[must_use]
    pub fn as_f64(&self) -> Option<f64> {
        match self {
            Self::Real(text) => parse_f64(text),
            _ => None,
        }
    }

    /// Consume this node and parse its text as an `f64`.
    ///
    /// Returns `None` under the same conditions as [`Self::as_f64`], to which this delegates.
    #[must_use]
    pub fn into_f64(self) -> Option<f64> {
        Self::as_f64(&self)
    }

    /// Return `other` if `self` is [`YamlData::Null`] or [`YamlData::BadValue`], and `self`
    /// otherwise.
    ///
    /// Both values are consumed. See [`Yaml::or`] for examples.
    ///
    /// [`Yaml::or`]: crate::Yaml::or
    #[must_use]
    pub fn or(self, other: Self) -> Self {
        if matches!(self, Self::BadValue | Self::Null) {
            other
        } else {
            self
        }
    }

    /// Borrowing counterpart of [`Self::or`].
    ///
    /// Returns `other` if `self` is [`YamlData::Null`] or [`YamlData::BadValue`], and `self`
    /// otherwise, without consuming either. This is convenient for less linear pipelines.
    #[must_use]
    pub fn borrowed_or<'a>(&'a self, other: &'a Self) -> &'a Self {
        if matches!(self, Self::BadValue | Self::Null) {
            other
        } else {
            self
        }
    }
}
// NOTE(ethiraric, 10/06/2024): We cannot create a "generic static" variable which would act as a
// `BAD_VALUE`. This means that, unlike for `Yaml`, we have to make the indexing method panic.
impl<'a, Node> Index<&'a str> for YamlData<Node>
where
    Node: std::hash::Hash + std::cmp::Eq + From<Self>,
{
    type Output = Node;

    /// Look up the value associated with the string key `idx` if `self` is a mapping.
    ///
    /// The lookup key is a [`YamlData::String`] holding `idx`, converted into a `Node`.
    ///
    /// # Panics
    /// This function panics if the key given does not exist within `self` (as per [`Index`]).
    ///
    /// This function also panics if `self` is not a [`YamlData::Hash`].
    fn index(&self, idx: &'a str) -> &Node {
        match self {
            Self::Hash(mapping) => {
                let key: Node = Self::String(idx.to_owned()).into();
                // Name the offending key rather than failing with an anonymous `unwrap` panic.
                mapping
                    .get(&key)
                    .unwrap_or_else(|| panic!("{idx}: key does not exist"))
            }
            // Same message as the `IndexMut<&str>` implementation uses for this case.
            _ => panic!("Not a hash type"),
        }
    }
}
impl<'a, Node> IndexMut<&'a str> for YamlData<Node>
where
    Node: std::hash::Hash + std::cmp::Eq + From<Self>,
{
    /// Mutably look up the value associated with the string key `idx` if `self` is a mapping.
    ///
    /// The lookup key is a [`YamlData::String`] holding `idx`, converted into a `Node`.
    ///
    /// # Panics
    /// This function panics if the key given does not exist within `self` (as per [`IndexMut`]).
    ///
    /// This function also panics if `self` is not a [`YamlData::Hash`].
    fn index_mut(&mut self, idx: &'a str) -> &mut Node {
        let key: Node = Self::String(idx.to_owned()).into();
        self.as_mut_hash()
            .expect("Not a hash type")
            .get_mut(&key)
            .unwrap()
    }
}
impl<Node> Index<usize> for YamlData<Node>
where
    Node: std::hash::Hash + std::cmp::Eq + From<Self>,
{
    type Output = Node;

    /// Perform indexing if `self` is a sequence or a mapping.
    ///
    /// For a sequence, `idx` is the position of the element. For a mapping, `idx` is looked up as
    /// the key [`YamlData::Integer`]`(idx)`.
    ///
    /// # Panics
    /// This function panics if the index given is out of range (as per [`Index`]). If `self` is a
    /// [`YamlData::Array`], this is when the index is bigger or equal to the length of the
    /// underlying `Vec`. If `self` is a [`YamlData::Hash`], this is when the mapping does not
    /// contain [`YamlData::Integer`]`(idx)` as a key (which includes any `idx` that does not fit
    /// in an `i64`).
    ///
    /// This function also panics if `self` is not a [`YamlData::Array`] nor a [`YamlData::Hash`].
    fn index(&self, idx: usize) -> &Node {
        match self {
            Self::Array(sequence) => sequence
                .get(idx)
                .unwrap_or_else(|| panic!("{idx}: Index out of bounds")),
            Self::Hash(mapping) => {
                // An index that does not fit in an `i64` cannot be an integer key of the mapping.
                let index = i64::try_from(idx)
                    .unwrap_or_else(|_| panic!("{idx}: key does not exist"));
                let key: Node = Self::Integer(index).into();
                mapping
                    .get(&key)
                    .unwrap_or_else(|| panic!("{idx}: key does not exist"))
            }
            // Same message as the `IndexMut<usize>` implementation uses for this case.
            _ => panic!("Attempting to index but `self` is not a sequence nor a mapping"),
        }
    }
}
impl<Node> IndexMut<usize> for YamlData<Node>
where
    Node: std::hash::Hash + std::cmp::Eq + From<Self>,
{
    /// Perform mutable indexing if `self` is a sequence or a mapping.
    ///
    /// For a sequence, `idx` is the position of the element. For a mapping, `idx` is looked up as
    /// the key [`YamlData::Integer`]`(idx)`.
    ///
    /// # Panics
    /// This function panics if the index given is out of range (as per [`IndexMut`]). If `self` is
    /// a [`YamlData::Array`], this is when the index is bigger or equal to the length of the
    /// underlying `Vec`. If `self` is a [`YamlData::Hash`], this is when the mapping does not
    /// contain [`YamlData::Integer`]`(idx)` as a key.
    ///
    /// This function also panics if `self` is not a [`YamlData::Array`] nor a [`YamlData::Hash`].
    fn index_mut(&mut self, idx: usize) -> &mut Node {
        match self {
            Self::Hash(mapping) => {
                let index = i64::try_from(idx).unwrap();
                let key: Node = Self::Integer(index).into();
                mapping.get_mut(&key).unwrap()
            }
            Self::Array(sequence) => &mut sequence[idx],
            _ => panic!("Attempting to index but `self` is not a sequence nor a mapping"),
        }
    }
}
impl<Node> IntoIterator for YamlData<Node>
where
Node: std::hash::Hash + std::cmp::Eq + From<Self>,
{
type Item = Node;
type IntoIter = AnnotatedYamlIter<Node>;
fn into_iter(self) -> Self::IntoIter {
Self::IntoIter {
yaml: self.into_vec().unwrap_or_default().into_iter(),
}
}
}
/// An iterator over a [`YamlData`] node.
///
/// Returned by the [`IntoIterator`] implementation of [`YamlData`]; yields each `Node` by value.
#[allow(clippy::module_name_repetitions)]
pub struct AnnotatedYamlIter<Node>
where
Node: std::hash::Hash + std::cmp::Eq + From<YamlData<Node>>,
{
/// The owning iterator over the elements taken out of the node.
yaml: std::vec::IntoIter<Node>,
}
impl<Node> Iterator for AnnotatedYamlIter<Node>
where
Node: std::hash::Hash + std::cmp::Eq + From<YamlData<Node>>,
{
type Item = Node;
/// Yield the next element, or `None` once the wrapped `Vec` iterator is exhausted.
fn next(&mut self) -> Option<Node> {
self.yaml.next()
}
}

View file

@ -0,0 +1,152 @@
//! A YAML node with position in the source document.
//!
//! This is set aside so as to not clutter `annotated.rs`.
use hashlink::LinkedHashMap;
use saphyr_parser::{Marker, Parser, ScanError};
use crate::{LoadableYamlNode, Yaml, YamlData, YamlLoader};
/// A YAML node with [`Marker`]s pointing to the start of the node.
///
/// This structure does not implement functions to operate on the YAML object. To access those,
/// refer to the [`Self::data`] field.
///
/// Equality and hashing (see the `PartialEq` and `Hash` implementations in this file) only
/// consider [`Self::data`]; the [`Self::marker`] is ignored.
#[derive(Clone, Debug)]
pub struct MarkedYaml {
/// The marker pointing to the start of the node.
///
/// The marker is relative to the start of the input stream that was given to the parser, not
/// to the start of the document within the input stream.
pub marker: Marker,
/// The YAML contents of the node.
pub data: YamlData<MarkedYaml>,
}
impl MarkedYaml {
/// Load the given string as an array of YAML documents.
///
/// See the function [`load_from_str`] for more details.
///
/// # Errors
/// Returns `ScanError` when loading fails.
///
/// [`load_from_str`]: `Yaml::load_from_str`
pub fn load_from_str(source: &str) -> Result<Vec<Self>, ScanError> {
Self::load_from_iter(source.chars())
}
/// Load the contents of the given iterator as an array of YAML documents.
///
/// See the function [`load_from_str`] for more details.
///
/// # Errors
/// Returns `ScanError` when loading fails.
///
/// [`load_from_str`]: `Yaml::load_from_str`
pub fn load_from_iter<I: Iterator<Item = char>>(source: I) -> Result<Vec<Self>, ScanError> {
let mut parser = Parser::new(source);
Self::load_from_parser(&mut parser)
}
/// Load the contents from the specified [`Parser`] as an array of YAML documents.
///
/// See the function [`load_from_str`] for more details.
///
/// # Errors
/// Returns `ScanError` when loading fails.
///
/// [`load_from_str`]: `Yaml::load_from_str`
pub fn load_from_parser<I: Iterator<Item = char>>(
parser: &mut Parser<I>,
) -> Result<Vec<Self>, ScanError> {
let mut loader = YamlLoader::<Self>::default();
parser.load(&mut loader, true)?;
Ok(loader.into_documents())
}
}
impl PartialEq for MarkedYaml {
    /// Compare the YAML contents only; the markers do not take part in equality.
    fn eq(&self, other: &Self) -> bool {
        self.data == other.data
    }
}
// `Eq` is needed so that `MarkedYaml` can be used as a key of the `LinkedHashMap` backing a
// mapping. Equality is delegated to `data` by the `PartialEq` implementation above.
// NOTE(review): this is only sound if equality on `YamlData` is a full equivalence relation;
// confirm against `YamlData`'s `PartialEq`.
impl Eq for MarkedYaml {}
impl std::hash::Hash for MarkedYaml {
    /// Hash the YAML contents only, mirroring `PartialEq` which also ignores the marker.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        std::hash::Hash::hash(&self.data, state);
    }
}
impl From<YamlData<MarkedYaml>> for MarkedYaml {
fn from(value: YamlData<MarkedYaml>) -> Self {
Self {
marker: Marker::default(),
data: value,
}
}
}
impl LoadableYamlNode for MarkedYaml {
    /// Build a node with a default marker from a bare [`Yaml`].
    ///
    /// The contents of `Array` and `Hash` are discarded: only an empty container of the same kind
    /// is created.
    fn from_bare_yaml(yaml: Yaml) -> Self {
        Self {
            marker: Marker::default(),
            data: match yaml {
                Yaml::Real(x) => YamlData::Real(x),
                Yaml::Integer(x) => YamlData::Integer(x),
                Yaml::String(x) => YamlData::String(x),
                Yaml::Boolean(x) => YamlData::Boolean(x),
                // Array and Hash will always have their container empty.
                Yaml::Array(_) => YamlData::Array(vec![]),
                Yaml::Hash(_) => YamlData::Hash(LinkedHashMap::new()),
                Yaml::Alias(x) => YamlData::Alias(x),
                Yaml::Null => YamlData::Null,
                Yaml::BadValue => YamlData::BadValue,
            },
        }
    }

    fn is_array(&self) -> bool {
        self.data.is_array()
    }

    fn is_hash(&self) -> bool {
        self.data.is_hash()
    }

    fn is_badvalue(&self) -> bool {
        self.data.is_badvalue()
    }

    /// Return the underlying `Vec` of the node.
    ///
    /// # Panics
    /// Panics if `self.data` is not a [`YamlData::Array`].
    fn array_mut(&mut self) -> &mut Vec<Self> {
        if let YamlData::Array(x) = &mut self.data {
            x
        } else {
            panic!("Called array_mut on a non-array");
        }
    }

    /// Return the underlying map of the node.
    ///
    /// # Panics
    /// Panics if `self.data` is not a [`YamlData::Hash`].
    fn hash_mut(&mut self) -> &mut LinkedHashMap<Self, Self> {
        if let YamlData::Hash(x) = &mut self.data {
            x
        } else {
            // The message names this method; it previously claimed `array_mut` had been called.
            panic!("Called hash_mut on a non-hash");
        }
    }

    /// Move the node out of `self`, leaving a `BadValue` with a default marker in its place.
    fn take(&mut self) -> Self {
        std::mem::replace(
            self,
            MarkedYaml {
                marker: Marker::default(),
                data: YamlData::BadValue,
            },
        )
    }

    /// Store `marker` in the node (builder-style).
    fn with_marker(mut self, marker: Marker) -> Self {
        self.marker = marker;
        self
    }
}

14
saphyr/src/char_traits.rs Normal file
View file

@ -0,0 +1,14 @@
//! Holds functions to determine if a character belongs to a specific character set.
/// Check if the string can be expressed as a valid literal block scalar.
///
/// The YAML spec supports all of the following in block literals except `#xFEFF`:
/// ```no_compile
/// #x9 | #xA | [#x20-#x7E]                  /* 8 bit */
/// | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] /* 16 bit */
/// | [#x10000-#x10FFFF]                     /* 32 bit */
/// ```
///
/// Returns `true` for the empty string, since there is no offending character.
#[inline]
pub(crate) fn is_valid_literal_block_scalar(string: &str) -> bool {
    string.chars().all(|character: char| {
        matches!(
            character,
            '\t' | '\n'
                | '\x20'..='\x7e'
                | '\u{0085}'
                | '\u{00a0}'..='\u{d7ff}'
                // The two ranges below are `[#xE000-#xFFFD]` minus the byte order mark `#xFEFF`.
                | '\u{e000}'..='\u{fefe}'
                | '\u{ff00}'..='\u{fffd}'
                | '\u{10000}'..='\u{10ffff}'
        )
    })
}

437
saphyr/src/emitter.rs Normal file
View file

@ -0,0 +1,437 @@
//! YAML serialization helpers.
use crate::char_traits;
use crate::yaml::{Hash, Yaml};
use std::convert::From;
use std::error::Error;
use std::fmt::{self, Display};
/// An error when emitting YAML.
///
/// Created from a [`fmt::Error`] through the `From` implementation in this file, which lets the
/// emitter use `?` on writes to its [`fmt::Write`] sink.
#[derive(Copy, Clone, Debug)]
pub enum EmitError {
/// A formatting error.
FmtError(fmt::Error),
}
impl Error for EmitError {
fn cause(&self) -> Option<&dyn Error> {
None
}
}
impl Display for EmitError {
    /// Render the error by forwarding to the wrapped error's own `Display` output.
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        match self {
            EmitError::FmtError(inner) => Display::fmt(inner, formatter),
        }
    }
}
impl From<fmt::Error> for EmitError {
fn from(f: fmt::Error) -> Self {
EmitError::FmtError(f)
}
}
/// The YAML serializer.
///
/// ```
/// # use saphyr::{Yaml, YamlEmitter};
/// let input_string = "a: b\nc: d";
/// let yaml = Yaml::load_from_str(input_string).unwrap();
///
/// let mut output = String::new();
/// YamlEmitter::new(&mut output).dump(&yaml[0]).unwrap();
///
/// assert_eq!(output, r#"---
/// a: b
/// c: d"#);
/// ```
#[allow(clippy::module_name_repetitions)]
pub struct YamlEmitter<'a> {
/// The sink the YAML text is written to.
writer: &'a mut dyn fmt::Write,
/// The number of spaces written per indentation level.
best_indent: usize,
/// Whether nested containers may start on the same line as their `-`, `?` or `:` indicator.
compact: bool,
/// The current nesting level; `-1` outside of any container.
level: isize,
/// Whether strings containing newlines are emitted as literal blocks when possible.
multiline_strings: bool,
}
/// A convenience alias for emitter functions that may fail without returning a value.
///
/// The error is an [`EmitError`].
pub type EmitResult = Result<(), EmitError>;
// Adapted from `serialize::json`.
/// Write `v` to `wr` as a double-quoted string.
///
/// `"` and `\` are backslash-escaped; ASCII control characters (`0x00..=0x1f` and `0x7f`) are
/// written as their short escape when there is one (`\b`, `\t`, `\n`, `\f`, `\r`) and as `\u00XX`
/// otherwise. Every other byte is copied through unchanged.
fn escape_str(wr: &mut dyn fmt::Write, v: &str) -> Result<(), fmt::Error> {
    /// Return the escape sequence for `byte`, or `None` if it can be written verbatim.
    fn escape_sequence(byte: u8) -> Option<&'static str> {
        let sequence = match byte {
            b'"' => "\\\"",
            b'\\' => "\\\\",
            b'\x00' => "\\u0000",
            b'\x01' => "\\u0001",
            b'\x02' => "\\u0002",
            b'\x03' => "\\u0003",
            b'\x04' => "\\u0004",
            b'\x05' => "\\u0005",
            b'\x06' => "\\u0006",
            b'\x07' => "\\u0007",
            b'\x08' => "\\b",
            b'\t' => "\\t",
            b'\n' => "\\n",
            b'\x0b' => "\\u000b",
            b'\x0c' => "\\f",
            b'\r' => "\\r",
            b'\x0e' => "\\u000e",
            b'\x0f' => "\\u000f",
            b'\x10' => "\\u0010",
            b'\x11' => "\\u0011",
            b'\x12' => "\\u0012",
            b'\x13' => "\\u0013",
            b'\x14' => "\\u0014",
            b'\x15' => "\\u0015",
            b'\x16' => "\\u0016",
            b'\x17' => "\\u0017",
            b'\x18' => "\\u0018",
            b'\x19' => "\\u0019",
            b'\x1a' => "\\u001a",
            b'\x1b' => "\\u001b",
            b'\x1c' => "\\u001c",
            b'\x1d' => "\\u001d",
            b'\x1e' => "\\u001e",
            b'\x1f' => "\\u001f",
            b'\x7f' => "\\u007f",
            _ => return None,
        };
        Some(sequence)
    }

    wr.write_str("\"")?;

    // Start of the run of bytes that has not been written yet. Every escaped byte is ASCII, so
    // the run boundaries always fall on `char` boundaries.
    let mut unwritten_from = 0;
    for (position, byte) in v.bytes().enumerate() {
        if let Some(sequence) = escape_sequence(byte) {
            if unwritten_from < position {
                wr.write_str(&v[unwritten_from..position])?;
            }
            wr.write_str(sequence)?;
            unwritten_from = position + 1;
        }
    }
    if unwritten_from != v.len() {
        wr.write_str(&v[unwritten_from..])?;
    }

    wr.write_str("\"")?;
    Ok(())
}
impl<'a> YamlEmitter<'a> {
    /// Create a new emitter serializing into `writer`.
    ///
    /// The emitter starts with an indentation of 2 spaces, compact notation enabled and
    /// multiline (literal block) strings disabled.
    pub fn new(writer: &'a mut dyn fmt::Write) -> Self {
        YamlEmitter {
            writer,
            best_indent: 2,
            compact: true,
            level: -1,
            multiline_strings: false,
        }
    }

    /// Set 'compact inline notation' on or off, as described for block
    /// [sequences](http://www.yaml.org/spec/1.2/spec.html#id2797382)
    /// and
    /// [mappings](http://www.yaml.org/spec/1.2/spec.html#id2798057).
    ///
    /// In this form, blocks cannot have any properties (such as anchors
    /// or tags), which should be OK, because this emitter doesn't
    /// (currently) emit those anyways.
    ///
    /// TODO(ethiraric, 2024/04/02): We can support those now.
    pub fn compact(&mut self, compact: bool) {
        self.compact = compact;
    }

    /// Determine if this emitter is using 'compact inline notation'.
    #[must_use]
    pub fn is_compact(&self) -> bool {
        self.compact
    }

    /// Render strings containing multiple lines in [literal style].
    ///
    /// # Examples
    ///
    /// ```rust
    /// use saphyr::{Yaml, YamlEmitter};
    ///
    /// let input = r#"{foo: "bar!\nbar!", baz: 42}"#;
    /// let parsed = Yaml::load_from_str(input).unwrap();
    /// eprintln!("{:?}", parsed);
    ///
    /// let mut output = String::new();
    /// let mut emitter = YamlEmitter::new(&mut output);
    /// emitter.multiline_strings(true);
    /// emitter.dump(&parsed[0]).unwrap();
    /// assert_eq!(output.as_str(), "\
    /// ---
    /// foo: |-
    ///   bar!
    ///   bar!
    /// baz: 42");
    /// ```
    ///
    /// [literal style]: https://yaml.org/spec/1.2/spec.html#id2795688
    pub fn multiline_strings(&mut self, multiline_strings: bool) {
        self.multiline_strings = multiline_strings;
    }

    /// Determine if this emitter will emit multiline strings when appropriate.
    #[must_use]
    pub fn is_multiline_strings(&self) -> bool {
        self.multiline_strings
    }

    /// Dump Yaml to an output stream.
    ///
    /// A `---` document start line is written first, followed by the node. No trailing newline is
    /// written after the node.
    ///
    /// # Errors
    /// Returns `EmitError` when an error occurs.
    pub fn dump(&mut self, doc: &Yaml) -> EmitResult {
        // write DocumentStart
        writeln!(self.writer, "---")?;
        // Reset the nesting level so that the emitter can be reused for several documents.
        self.level = -1;
        self.emit_node(doc)
    }

    /// Write `level * best_indent` spaces. Nothing is written at level 0 or below.
    fn write_indent(&mut self) -> EmitResult {
        if self.level <= 0 {
            return Ok(());
        }
        for _ in 0..self.level {
            for _ in 0..self.best_indent {
                write!(self.writer, " ")?;
            }
        }
        Ok(())
    }

    /// Emit any node at the current position, without a leading separator.
    fn emit_node(&mut self, node: &Yaml) -> EmitResult {
        match *node {
            Yaml::Array(ref v) => self.emit_array(v),
            Yaml::Hash(ref h) => self.emit_hash(h),
            Yaml::String(ref v) => {
                if self.multiline_strings
                    && v.contains('\n')
                    && char_traits::is_valid_literal_block_scalar(v)
                {
                    self.emit_literal_block(v)?;
                } else if need_quotes(v) {
                    escape_str(self.writer, v)?;
                } else {
                    write!(self.writer, "{v}")?;
                }
                Ok(())
            }
            Yaml::Boolean(v) => {
                self.writer.write_str(if v { "true" } else { "false" })?;
                Ok(())
            }
            Yaml::Integer(v) => {
                write!(self.writer, "{v}")?;
                Ok(())
            }
            Yaml::Real(ref v) => {
                write!(self.writer, "{v}")?;
                Ok(())
            }
            Yaml::Null | Yaml::BadValue => {
                write!(self.writer, "~")?;
                Ok(())
            }
            // XXX(chenyh) Alias
            // Aliases are not supported: nothing is written for them.
            Yaml::Alias(_) => Ok(()),
        }
    }

    /// Emit `v` as a literal block scalar, one indentation level deeper than the current one.
    ///
    /// The `|-` (strip) indicator is used when `v` does not end with a newline, `|` otherwise.
    fn emit_literal_block(&mut self, v: &str) -> EmitResult {
        let ends_with_newline = v.ends_with('\n');
        if ends_with_newline {
            self.writer.write_str("|")?;
        } else {
            self.writer.write_str("|-")?;
        }

        self.level += 1;
        // lines() will omit the last line if it is empty.
        for line in v.lines() {
            writeln!(self.writer)?;
            self.write_indent()?;
            // It's literal text, so don't escape special chars.
            self.writer.write_str(line)?;
        }
        self.level -= 1;
        Ok(())
    }

    /// Emit a sequence in block style, or `[]` if it is empty.
    fn emit_array(&mut self, v: &[Yaml]) -> EmitResult {
        if v.is_empty() {
            write!(self.writer, "[]")?;
        } else {
            self.level += 1;
            for (cnt, x) in v.iter().enumerate() {
                // The first entry continues the current line; the others start a new one.
                if cnt > 0 {
                    writeln!(self.writer)?;
                    self.write_indent()?;
                }
                write!(self.writer, "-")?;
                self.emit_val(true, x)?;
            }
            self.level -= 1;
        }
        Ok(())
    }

    /// Emit a mapping in block style, or `{}` if it is empty.
    ///
    /// Keys that are themselves sequences or mappings are written with the explicit `?` / `:`
    /// notation.
    fn emit_hash(&mut self, h: &Hash) -> EmitResult {
        if h.is_empty() {
            self.writer.write_str("{}")?;
        } else {
            self.level += 1;
            for (cnt, (k, v)) in h.iter().enumerate() {
                let complex_key = matches!(*k, Yaml::Hash(_) | Yaml::Array(_));
                // The first entry continues the current line; the others start a new one.
                if cnt > 0 {
                    writeln!(self.writer)?;
                    self.write_indent()?;
                }
                if complex_key {
                    write!(self.writer, "?")?;
                    self.emit_val(true, k)?;
                    writeln!(self.writer)?;
                    self.write_indent()?;
                    write!(self.writer, ":")?;
                    self.emit_val(true, v)?;
                } else {
                    self.emit_node(k)?;
                    write!(self.writer, ":")?;
                    self.emit_val(false, v)?;
                }
            }
            self.level -= 1;
        }
        Ok(())
    }

    /// Write what separates an indicator (`-`, `?` or `:`) from the container following it.
    ///
    /// With `same_line`, a single space is written. Otherwise a newline is written, followed by
    /// the indentation of the level the container is about to open.
    fn emit_container_separator(&mut self, same_line: bool) -> EmitResult {
        if same_line {
            write!(self.writer, " ")?;
        } else {
            writeln!(self.writer)?;
            self.level += 1;
            self.write_indent()?;
            self.level -= 1;
        }
        Ok(())
    }

    /// Emit a yaml as a hash or array value: i.e., which should appear
    /// following a ":" or "-", either after a space, or on a new line.
    /// If `inline` is true, then the preceding characters are distinct
    /// and short enough to respect the compact flag.
    fn emit_val(&mut self, inline: bool, val: &Yaml) -> EmitResult {
        match *val {
            Yaml::Array(ref v) => {
                // Empty containers are written as `[]`, which always fits on the same line.
                self.emit_container_separator((inline && self.compact) || v.is_empty())?;
                self.emit_array(v)
            }
            Yaml::Hash(ref h) => {
                // Empty containers are written as `{}`, which always fits on the same line.
                self.emit_container_separator((inline && self.compact) || h.is_empty())?;
                self.emit_hash(h)
            }
            _ => {
                write!(self.writer, " ")?;
                self.emit_node(val)
            }
        }
    }
}
/// Check if the string requires quoting.
///
/// Empty strings and strings starting or ending with a space must be quoted.
///
/// Strings starting with any of the following characters must be quoted.
/// &, *, ?, |, -, <, >, =, !, %, @
///
/// Strings containing any of the following characters must be quoted.
/// :, {, }, \[, \], ,, #, `, ", ', \
///
/// If the string contains any ASCII control character (`\0` to `\x1f`, which includes `\t`, `\n`
/// and `\r`, as well as `\x7f`), it must be escaped with double quotes. Those are exactly the
/// control characters `escape_str` knows how to escape.
///
/// Finally, there are other cases when the strings must be quoted, no matter if you're using single or double quotes:
///   * When the string is true or false (otherwise, it would be treated as a boolean value);
///   * When the string is null or ~ (otherwise, it would be considered as a null value);
///   * When the string looks like a number, such as integers (e.g. 2, 14, etc.), floats (e.g. 2.6, 14.9) and exponential numbers (e.g. 12e7, etc.) (otherwise, it would be treated as a numeric value);
///   * When the string starts with `.` or `0x`.
#[allow(clippy::doc_markdown)]
fn need_quotes(string: &str) -> bool {
    /// Leading or trailing spaces would be lost in a plain scalar.
    fn need_quotes_spaces(string: &str) -> bool {
        string.starts_with(' ') || string.ends_with(' ')
    }

    string.is_empty()
        || need_quotes_spaces(string)
        || string.starts_with(|character: char| {
            matches!(
                character,
                '&' | '*' | '?' | '|' | '-' | '<' | '>' | '=' | '!' | '%' | '@'
            )
        })
        || string.contains(|character: char| {
            matches!(
                character,
                ':' | '{'
                    | '}'
                    | '['
                    | ']'
                    | ','
                    | '#'
                    | '`'
                    | '\"'
                    | '\''
                    | '\\'
                    // Every C0 control character and DEL: none may appear raw in a plain
                    // scalar, including \x07, \x08, \x0b, \x0c and \x1b.
                    | '\0'..='\x1f'
                    | '\x7f'
            )
        })
        || [
            // http://yaml.org/type/bool.html
            // Note: 'y', 'Y', 'n', 'N', is not quoted deliberately, as in libyaml. PyYAML also parse
            // them as string, not booleans, although it is violating the YAML 1.1 specification.
            // See https://github.com/dtolnay/serde-yaml/pull/83#discussion_r152628088.
            "yes", "Yes", "YES", "no", "No", "NO", "True", "TRUE", "true", "False", "FALSE",
            "false", "on", "On", "ON", "off", "Off", "OFF",
            // http://yaml.org/type/null.html
            "null", "Null", "NULL", "~",
        ]
        .contains(&string)
        || string.starts_with('.')
        || string.starts_with("0x")
        || string.parse::<i64>().is_ok()
        || string.parse::<f64>().is_ok()
}
#[cfg(test)]
mod test {
    use crate::Yaml;

    use super::YamlEmitter;

    /// A string containing a newline is emitted as a literal block when the option is enabled.
    #[test]
    fn test_multiline_string() {
        let input = r#"{foo: "bar!\nbar!", baz: 42}"#;
        let parsed = Yaml::load_from_str(input).unwrap();
        let mut output = String::new();
        let mut emitter = YamlEmitter::new(&mut output);
        emitter.multiline_strings(true);
        emitter.dump(&parsed[0]).unwrap();
        // The block is indented by one level (2 spaces) and uses the strip indicator since the
        // string does not end with a newline.
        assert_eq!(output, "---\nfoo: |-\n  bar!\n  bar!\nbaz: 42");
    }
}

289
saphyr/src/encoding.rs Normal file
View file

@ -0,0 +1,289 @@
//! Encoding utilities. Available only with the `encoding` feature.
use std::{borrow::Cow, ops::ControlFlow};
use encoding_rs::{Decoder, DecoderResult, Encoding};
use crate::{loader::LoadError, Yaml};
/// The signature of the function to call when using [`YAMLDecodingTrap::Call`].
///
/// The arguments are as follows:
/// * `malformation_length`: The length of the sequence the decoder failed to decode.
/// * `bytes_read_after_malformation`: The number of lookahead bytes the decoder consumed after
/// the malformation.
/// * `input_at_malformation`: What the input buffer is at the malformation.
/// This is the buffer starting at the malformation. The first `malformation_length` bytes are
/// the problematic sequence. The following `bytes_read_after_malformation` are already stored
/// in the decoder and will not be re-fed.
/// * `output`: The output string.
///
/// The function must modify `output` as it feels is best. For instance, one could recreate the
/// behavior of [`YAMLDecodingTrap::Ignore`] with an empty function, [`YAMLDecodingTrap::Replace`]
/// by pushing a `\u{FFFD}` into `output` and [`YAMLDecodingTrap::Strict`] by returning
/// [`ControlFlow::Break`].
///
/// # Returns
/// The function must return [`ControlFlow::Continue`] if decoding may continue or
/// [`ControlFlow::Break`] if decoding must be aborted. An optional error string may be supplied:
/// if the string carried by [`ControlFlow::Break`] is empty, a default message describing the
/// offending byte sequence is used instead.
pub type YAMLDecodingTrapFn = fn(
malformation_length: u8,
bytes_read_after_malformation: u8,
input_at_malformation: &[u8],
output: &mut String,
) -> ControlFlow<Cow<'static, str>>;
/// The behavior [`YamlDecoder`] must have when a decoding error occurs.
///
/// A freshly created [`YamlDecoder`] uses [`YAMLDecodingTrap::Strict`].
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum YAMLDecodingTrap {
/// Ignore the offending bytes, remove them from the output.
Ignore,
/// Error out.
Strict,
/// Replace them with the Unicode REPLACEMENT CHARACTER.
Replace,
/// Call the user-supplied function upon decoding malformation.
Call(YAMLDecodingTrapFn),
}
/// `YamlDecoder` is a YAML loading builder that allows you to supply your own encoding error trap.
/// For example, to read a YAML file while ignoring Unicode decoding errors you can set the
/// `encoding_trap` to [`YAMLDecodingTrap::Ignore`].
/// ```rust
/// use saphyr::{YamlDecoder, YAMLDecodingTrap};
///
/// let string = b"---
/// a\xa9: 1
/// b: 2.2
/// c: [1, 2]
/// ";
/// let out = YamlDecoder::read(string as &[u8])
/// .encoding_trap(YAMLDecodingTrap::Ignore)
/// .decode()
/// .unwrap();
/// ```
pub struct YamlDecoder<T: std::io::Read> {
/// The input stream.
source: T,
/// The behavior to adopt when encountering a malformed encoding.
trap: YAMLDecodingTrap,
}
impl<T: std::io::Read> YamlDecoder<T> {
    /// Create a `YamlDecoder` decoding the given source.
    ///
    /// The decoder starts with the [`YAMLDecodingTrap::Strict`] behavior.
    pub fn read(source: T) -> YamlDecoder<T> {
        let trap = YAMLDecodingTrap::Strict;
        YamlDecoder { source, trap }
    }

    /// Set the behavior of the decoder when the encoding is invalid.
    pub fn encoding_trap(&mut self, trap: YAMLDecodingTrap) -> &mut Self {
        self.trap = trap;
        self
    }

    /// Run the decode operation with the source and trap the `YamlDecoder` was built with.
    ///
    /// The whole source is read in memory, converted to UTF-8 and then parsed as YAML.
    ///
    /// # Errors
    /// Returns `LoadError` when decoding fails.
    pub fn decode(&mut self) -> Result<Vec<Yaml>, LoadError> {
        let mut raw_bytes = Vec::new();
        self.source.read_to_end(&mut raw_bytes)?;

        // Prefer the encoding announced by a BOM; without one, guess from the position of the
        // null bytes with `detect_utf16_endianness`.
        let encoding = match Encoding::for_bom(&raw_bytes) {
            Some((announced, _bom_length)) => announced,
            None => detect_utf16_endianness(&raw_bytes),
        };

        let mut decoder = encoding.new_decoder();
        let mut decoded = String::new();
        decode_loop(&raw_bytes, &mut decoded, &mut decoder, self.trap)?;

        match Yaml::load_from_str(&decoded) {
            Ok(documents) => Ok(documents),
            Err(scan_error) => Err(LoadError::Scan(scan_error)),
        }
    }
}
/// Perform a loop of [`Decoder::decode_to_string`], reallocating `output` if needed.
fn decode_loop(
input: &[u8],
output: &mut String,
decoder: &mut Decoder,
trap: YAMLDecodingTrap,
) -> Result<(), LoadError> {
use crate::loader::LoadError;
output.reserve(input.len());
let mut total_bytes_read = 0;
loop {
match decoder.decode_to_string_without_replacement(&input[total_bytes_read..], output, true)
{
// If the input is empty, we processed the whole input.
(DecoderResult::InputEmpty, _) => break Ok(()),
// If the output is full, we must reallocate.
(DecoderResult::OutputFull, bytes_read) => {
total_bytes_read += bytes_read;
// The output is already reserved to the size of the input. We slowly resize. Here,
// we're expecting that 10% of bytes will double in size when converting to UTF-8.
output.reserve(input.len() / 10);
}
(DecoderResult::Malformed(malformed_len, bytes_after_malformed), bytes_read) => {
total_bytes_read += bytes_read;
match trap {
// Ignore (skip over) malformed character.
YAMLDecodingTrap::Ignore => {}
// Replace them with the Unicode REPLACEMENT CHARACTER.
YAMLDecodingTrap::Replace => {
output.push('\u{FFFD}');
}
// Otherwise error, getting as much context as possible.
YAMLDecodingTrap::Strict => {
let malformed_len = malformed_len as usize;
let bytes_after_malformed = bytes_after_malformed as usize;
let byte_idx = total_bytes_read - (malformed_len + bytes_after_malformed);
let malformed_sequence = &input[byte_idx..byte_idx + malformed_len];
break Err(LoadError::Decode(Cow::Owned(format!(
"Invalid character sequence at {byte_idx}: {malformed_sequence:?}",
))));
}
YAMLDecodingTrap::Call(callback) => {
let byte_idx =
total_bytes_read - ((malformed_len + bytes_after_malformed) as usize);
let malformed_sequence =
&input[byte_idx..byte_idx + malformed_len as usize];
if let ControlFlow::Break(error) = callback(
malformed_len,
bytes_after_malformed,
&input[byte_idx..],
output,
) {
if error.is_empty() {
break Err(LoadError::Decode(Cow::Owned(format!(
"Invalid character sequence at {byte_idx}: {malformed_sequence:?}",
))));
}
break Err(LoadError::Decode(error));
}
}
}
}
}
}
}
/// Guess the encoding of a bytestream that does not start with a BOM.
///
/// The encoding crate knows how to tell apart UTF-8 from UTF-16LE and UTF-16BE when the
/// bytestream starts with a BOM codepoint. However, it doesn't even attempt to guess the UTF-16
/// endianness of the input bytestream since in the general case the bytestream could start with a
/// codepoint that uses both bytes.
///
/// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character.
/// This allows the encoding to be deduced by the pattern of null (#x00) characters: a null byte
/// followed by a non-null byte means UTF-16BE, a non-null byte followed by a null byte means
/// UTF-16LE, and anything else (including fewer than 2 bytes) is treated as UTF-8.
///
/// See spec at <https://yaml.org/spec/1.2/spec.html#id2771184>
fn detect_utf16_endianness(b: &[u8]) -> &'static Encoding {
    match b {
        [0, second, ..] if *second != 0 => encoding_rs::UTF_16BE,
        [first, 0, ..] if *first != 0 => encoding_rs::UTF_16LE,
        _ => encoding_rs::UTF_8,
    }
}
#[cfg(test)]
mod test {
    use super::{YAMLDecodingTrap, Yaml, YamlDecoder};

    /// A UTF-8 BOM is recognized and stripped.
    #[test]
    fn test_read_bom() {
        let s = b"\xef\xbb\xbf---
a: 1
b: 2.2
c: [1, 2]
";
        let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
        let doc = &out[0];
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    /// UTF-16LE input announced by a BOM.
    #[test]
    fn test_read_utf16le() {
        let s = b"\xff\xfe-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
        let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
        let doc = &out[0];
        println!("GOT: {doc:?}");
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        // Compare the absolute difference: without `.abs()`, any value below 2.2 would pass.
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    /// UTF-16BE input announced by a BOM.
    #[test]
    fn test_read_utf16be() {
        let s = b"\xfe\xff\x00-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
";
        let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
        let doc = &out[0];
        println!("GOT: {doc:?}");
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    /// UTF-16LE input without a BOM: the endianness is guessed from the null bytes.
    #[test]
    fn test_read_utf16le_nobom() {
        let s = b"-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
        let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
        let doc = &out[0];
        println!("GOT: {doc:?}");
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    /// An invalid UTF-8 byte is dropped when the trap is `Ignore`.
    #[test]
    fn test_read_trap() {
        let s = b"---
a\xa9: 1
b: 2.2
c: [1, 2]
";
        let out = YamlDecoder::read(s as &[u8])
            .encoding_trap(YAMLDecodingTrap::Ignore)
            .decode()
            .unwrap();
        let doc = &out[0];
        println!("GOT: {doc:?}");
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    #[test]
    fn test_or() {
        assert_eq!(Yaml::Null.or(Yaml::Integer(3)), Yaml::Integer(3));
        assert_eq!(Yaml::Integer(3).or(Yaml::Integer(7)), Yaml::Integer(3));
    }
}

71
saphyr/src/lib.rs Normal file
View file

@ -0,0 +1,71 @@
// Copyright 2015, Yuheng Chen.
// Copyright 2023, Ethiraric.
// See the LICENSE file at the top-level directory of this distribution.
//! YAML 1.2 implementation in pure Rust.
//!
//! # Usage
//!
//! This crate is [on github](https://github.com/saphyr-rs/saphyr) and can be used by adding
//! `saphyr` to the dependencies in your project's `Cargo.toml`.
//! ```toml
//! [dependencies]
//! saphyr = "0.0.1"
//! ```
//! or by using `cargo add` to get the latest version:
//! ```sh
//! cargo add saphyr
//! ```
//!
//! # Examples
//! Parse a string into `Vec<Yaml>` and then serialize it as a YAML string.
//!
//! ```
//! use saphyr::{Yaml, YamlEmitter};
//!
//! let docs = Yaml::load_from_str("[1, 2, 3]").unwrap();
//! let doc = &docs[0]; // select the first YAML document
//! assert_eq!(doc[0].as_i64().unwrap(), 1); // access elements by index
//!
//! let mut out_str = String::new();
//! let mut emitter = YamlEmitter::new(&mut out_str);
//! emitter.dump(doc).unwrap(); // dump the YAML object to a String
//! ```
//!
//! # Features
//! **Note:** With all features disabled, this crate's MSRV is `1.65.0`.
//!
//! #### `encoding` (_enabled by default_)
//! Enables encoding-aware decoding of Yaml documents.
//!
//! The MSRV for this feature is `1.70.0`.
#![warn(missing_docs, clippy::pedantic)]
#[macro_use]
mod macros;
mod annotated;
mod char_traits;
mod emitter;
mod loader;
mod yaml;
// Re-export main components.
pub use crate::annotated::{
marked_yaml::MarkedYaml, AnnotatedArray, AnnotatedHash, AnnotatedYamlIter, YamlData,
};
pub use crate::emitter::YamlEmitter;
pub use crate::loader::{LoadableYamlNode, YamlLoader};
pub use crate::yaml::{Array, Hash, Yaml, YamlIter};
#[cfg(feature = "encoding")]
mod encoding;
#[cfg(feature = "encoding")]
pub use crate::encoding::{YAMLDecodingTrap, YAMLDecodingTrapFn, YamlDecoder};
// Re-export `ScanError` as it is used as part of our public API and we want consumers to be able
// to inspect it (e.g. perform a `match`). They wouldn't be able without it.
pub use saphyr_parser::ScanError;
// Re-export [`Marker`] which is used for annotated YAMLs.
pub use saphyr_parser::Marker;

310
saphyr/src/loader.rs Normal file
View file

@ -0,0 +1,310 @@
//! The default loader.
use std::{collections::BTreeMap, sync::Arc};
use hashlink::LinkedHashMap;
use saphyr_parser::{Event, MarkedEventReceiver, Marker, ScanError, TScalarStyle, Tag};
use crate::{Hash, Yaml};
/// Main structure for parsing YAML.
///
/// The `YamlLoader` may load raw YAML documents or add metadata if needed. The type of the `Node`
/// dictates what data and metadata the loader will add to the `Node`.
///
/// Each node must implement [`LoadableYamlNode`]. The methods are required for the loader to
/// manipulate and populate the `Node`.
#[allow(clippy::module_name_repetitions)]
pub struct YamlLoader<Node>
where
Node: LoadableYamlNode,
{
/// The different YAML documents that are loaded.
docs: Vec<Node>,
// states
// (current node, anchor_id) tuple
// Stack of the nodes currently being built, innermost last.
doc_stack: Vec<(Node, usize)>,
// One entry per open mapping: the key waiting for its value, or a `BadValue` node when the
// next node to arrive is a key.
key_stack: Vec<Node>,
// Nodes that carry an anchor, indexed by anchor id, used to resolve aliases.
anchor_map: BTreeMap<usize, Node>,
}
// For some reason, rustc wants `Node: Default` if I `#[derive(Default)]`.
impl<Node> Default for YamlLoader<Node>
where
Node: LoadableYamlNode,
{
fn default() -> Self {
Self {
docs: vec![],
doc_stack: vec![],
key_stack: vec![],
anchor_map: BTreeMap::new(),
}
}
}
impl<Node> MarkedEventReceiver for YamlLoader<Node>
where
Node: LoadableYamlNode,
{
/// Handle one parser event and update the documents being built accordingly.
///
/// Containers are pushed on `doc_stack` when they start and attached to their parent (through
/// `insert_new_node`) when they end. Scalars and aliases are attached immediately. `marker` is
/// forwarded to every node created from the event.
fn on_event(&mut self, ev: Event, marker: Marker) {
match ev {
Event::DocumentStart | Event::Nothing | Event::StreamStart | Event::StreamEnd => {
// do nothing
}
Event::DocumentEnd => {
match self.doc_stack.len() {
// empty document
0 => self
.docs
.push(Node::from_bare_yaml(Yaml::BadValue).with_marker(marker)),
// The root node is the only one left on the stack: it is the document.
1 => self.docs.push(self.doc_stack.pop().unwrap().0),
_ => unreachable!(),
}
}
Event::SequenceStart(aid, _) => {
self.doc_stack.push((
Node::from_bare_yaml(Yaml::Array(Vec::new())).with_marker(marker),
aid,
));
}
Event::SequenceEnd => {
let node = self.doc_stack.pop().unwrap();
self.insert_new_node(node);
}
Event::MappingStart(aid, _) => {
self.doc_stack.push((
Node::from_bare_yaml(Yaml::Hash(Hash::new())).with_marker(marker),
aid,
));
// `BadValue` marks that the next node of this mapping is a key.
self.key_stack.push(Node::from_bare_yaml(Yaml::BadValue));
}
Event::MappingEnd => {
self.key_stack.pop().unwrap();
let node = self.doc_stack.pop().unwrap();
self.insert_new_node(node);
}
Event::Scalar(v, style, aid, tag) => {
// Anything but a plain scalar (e.g. a quoted one) is always a string.
let node = if style != TScalarStyle::Plain {
Yaml::String(v)
} else if let Some(Tag {
ref handle,
ref suffix,
}) = tag
{
// Only the core schema tags are interpreted; a value that does not match its tag
// becomes a `BadValue`.
if handle == "tag:yaml.org,2002:" {
match suffix.as_ref() {
"bool" => {
// "true" or "false"
match v.parse::<bool>() {
Err(_) => Yaml::BadValue,
Ok(v) => Yaml::Boolean(v),
}
}
"int" => match v.parse::<i64>() {
Err(_) => Yaml::BadValue,
Ok(v) => Yaml::Integer(v),
},
// The original text is kept; `parse_f64` is only used for validation.
"float" => match parse_f64(&v) {
Some(_) => Yaml::Real(v),
None => Yaml::BadValue,
},
"null" => match v.as_ref() {
"~" | "null" => Yaml::Null,
_ => Yaml::BadValue,
},
_ => Yaml::String(v),
}
} else {
Yaml::String(v)
}
} else {
// Datatype is not specified, or unrecognized
Yaml::from_str(&v)
};
self.insert_new_node((Node::from_bare_yaml(node).with_marker(marker), aid));
}
Event::Alias(id) => {
// An alias is replaced by a copy of the anchored node, or by a `BadValue` if the
// anchor is unknown. The copy carries the marker of the alias.
let n = match self.anchor_map.get(&id) {
Some(v) => v.clone(),
None => Node::from_bare_yaml(Yaml::BadValue),
};
self.insert_new_node((n.with_marker(marker), 0));
}
}
}
}
impl<Node> YamlLoader<Node>
where
    Node: LoadableYamlNode,
{
    /// Attach a finished node (and its anchor id) to the container currently being built.
    ///
    /// If no container is open, the node becomes the root of the document. In a mapping, nodes
    /// alternate between keys and values: a key is parked in `key_stack` until its value arrives.
    fn insert_new_node(&mut self, node: (Node, usize)) {
        let (new_node, anchor_id) = node;

        // valid anchor id starts from 1
        if anchor_id > 0 {
            self.anchor_map.insert(anchor_id, new_node.clone());
        }

        match self.doc_stack.last_mut() {
            None => self.doc_stack.push((new_node, anchor_id)),
            Some((parent, _)) => {
                if parent.is_array() {
                    parent.array_mut().push(new_node);
                } else if parent.is_hash() {
                    let pending_key = self.key_stack.last_mut().unwrap();
                    if pending_key.is_badvalue() {
                        // No key is pending: the new node is a key.
                        *pending_key = new_node;
                    } else {
                        // A key is pending: the new node is its value. Taking the key leaves a
                        // `BadValue` behind, so that the next node is treated as a key again.
                        parent.hash_mut().insert(pending_key.take(), new_node);
                    }
                }
            }
        }
    }

    /// Return the document nodes from `self`, consuming it in the process.
    #[must_use]
    pub fn into_documents(self) -> Vec<Node> {
        let Self { docs, .. } = self;
        docs
    }
}
/// An error that happened when loading a YAML document.
#[derive(Debug, Clone)]
pub enum LoadError {
/// An I/O error.
///
/// The error is stored in an [`Arc`], which keeps `LoadError` clonable.
IO(Arc<std::io::Error>),
/// An error within the scanner. This indicates a malformed YAML input.
Scan(ScanError),
/// A decoding error (e.g.: Invalid UTF-8).
Decode(std::borrow::Cow<'static, str>),
}
impl From<std::io::Error> for LoadError {
fn from(error: std::io::Error) -> Self {
LoadError::IO(Arc::new(error))
}
}
impl std::error::Error for LoadError {
    /// Return the wrapped I/O or scanner error. Decoding errors only carry a message and have no
    /// source.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            LoadError::IO(io_error) => Some(io_error),
            LoadError::Scan(scan_error) => Some(scan_error),
            LoadError::Decode(_) => None,
        }
    }
}
impl std::fmt::Display for LoadError {
    /// Render the error by forwarding to the `Display` output of whatever it wraps.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use std::fmt::Display;

        match self {
            LoadError::IO(io_error) => Display::fmt(io_error, f),
            LoadError::Scan(scan_error) => Display::fmt(scan_error, f),
            LoadError::Decode(message) => Display::fmt(message, f),
        }
    }
}
/// A trait providing methods used by the [`YamlLoader`].
///
/// This trait must be implemented on YAML node types (i.e.: [`Yaml`] and annotated YAML nodes). It
/// provides the necessary methods for [`YamlLoader`] to load data into the node.
///
/// Implementors must be [`Clone`] (the loader copies anchored nodes) and [`std::hash::Hash`] +
/// [`Eq`] (nodes are used as keys of the mapping returned by [`Self::hash_mut`]).
pub trait LoadableYamlNode: Clone + std::hash::Hash + Eq {
    /// Create an instance of `Self` from a [`Yaml`].
    ///
    /// Nodes must implement this to be built. The optional metadata that they contain will be
    /// later provided by the loader and can be default initialized. The [`Yaml`] object passed as
    /// parameter may be of the [`Array`] or [`Hash`] variants. In this event, the inner container
    /// will always be empty. There is no need to traverse all elements to convert them from
    /// [`Yaml`] to `Self`.
    ///
    /// [`Array`]: `Yaml::Array`
    /// [`Hash`]: `Yaml::Hash`
    fn from_bare_yaml(yaml: Yaml) -> Self;
    /// Return whether the YAML node is an array.
    fn is_array(&self) -> bool;
    /// Return whether the YAML node is a hash.
    fn is_hash(&self) -> bool;
    /// Return whether the YAML node is `BadValue`.
    fn is_badvalue(&self) -> bool;
    /// Retrieve the array variant of the YAML node.
    ///
    /// # Panics
    /// This function panics if `self` is not an array.
    fn array_mut(&mut self) -> &mut Vec<Self>;
    /// Retrieve the hash variant of the YAML node.
    ///
    /// # Panics
    /// This function panics if `self` is not a hash.
    fn hash_mut(&mut self) -> &mut LinkedHashMap<Self, Self>;
    /// Take the contained node out of `Self`, leaving a `BadValue` in its place.
    #[must_use]
    fn take(&mut self) -> Self;
    /// Provide the marker for the node (builder-style).
    ///
    /// The default implementation ignores the marker and returns `self` unchanged. Node types
    /// that record source positions should override it.
    #[inline]
    #[must_use]
    fn with_marker(self, _: Marker) -> Self {
        self
    }
}
impl LoadableYamlNode for Yaml {
    /// A bare [`Yaml`] carries no metadata, so the node is returned as is.
    fn from_bare_yaml(yaml: Yaml) -> Self {
        yaml
    }

    /// Whether this node is a [`Yaml::Array`].
    fn is_array(&self) -> bool {
        match self {
            Yaml::Array(_) => true,
            _ => false,
        }
    }

    /// Whether this node is a [`Yaml::Hash`].
    fn is_hash(&self) -> bool {
        match self {
            Yaml::Hash(_) => true,
            _ => false,
        }
    }

    /// Whether this node is [`Yaml::BadValue`].
    fn is_badvalue(&self) -> bool {
        match self {
            Yaml::BadValue => true,
            _ => false,
        }
    }

    /// Borrow the elements of a [`Yaml::Array`] mutably.
    ///
    /// # Panics
    /// Panics if `self` is not a [`Yaml::Array`].
    fn array_mut(&mut self) -> &mut Vec<Self> {
        match self {
            Yaml::Array(elements) => elements,
            _ => panic!("Called array_mut on a non-array"),
        }
    }

    /// Borrow the entries of a [`Yaml::Hash`] mutably.
    ///
    /// # Panics
    /// Panics if `self` is not a [`Yaml::Hash`].
    fn hash_mut(&mut self) -> &mut LinkedHashMap<Self, Self> {
        match self {
            Yaml::Hash(entries) => entries,
            _ => panic!("Called hash_mut on a non-hash"),
        }
    }

    /// Move the node out of `self`, leaving [`Yaml::BadValue`] behind.
    fn take(&mut self) -> Self {
        std::mem::replace(self, Yaml::BadValue)
    }
}
/// Parse a scalar as an `f64` following the YAML Core schema.
///
/// The infinity and not-a-number spellings of the Core schema (`.inf`, `-.inf`, `.nan` and their
/// capitalized forms) are recognized explicitly. Any other input is handed to Rust's float parser
/// only if it contains at least one ASCII digit: Rust also accepts words such as `inf`,
/// `infinity` or `nan` (in any case), which are plain strings in YAML.
///
/// Returns `None` if `v` is not a float.
// See: https://github.com/chyh1990/yaml-rust/issues/51
pub(crate) fn parse_f64(v: &str) -> Option<f64> {
    match v {
        ".inf" | ".Inf" | ".INF" | "+.inf" | "+.Inf" | "+.INF" => Some(f64::INFINITY),
        "-.inf" | "-.Inf" | "-.INF" => Some(f64::NEG_INFINITY),
        // `NaN` (no leading dot) is not a Core schema spelling; it is kept because it has always
        // been accepted here.
        ".nan" | ".NaN" | ".NAN" | "NaN" => Some(f64::NAN),
        _ if v.bytes().any(|b| b.is_ascii_digit()) => v.parse::<f64>().ok(),
        _ => None,
    }
}

85
saphyr/src/macros.rs Normal file
View file

@ -0,0 +1,85 @@
/// Generate `as_TYPE` methods for the [`crate::Yaml`] enum.
///
/// The generated method borrows `self` and returns the payload of `$variant` by value, so `$t`
/// must be a `Copy` type.
macro_rules! define_as (
    ($fn_name:ident, $t:ident, $variant:ident) => (
        /// Get a copy of the inner object in the YAML enum if it is a `$t`.
        ///
        /// # Return
        /// If the variant of `self` is `Self::$variant`, return `Some($t)` with a copy of the
        /// `$t` contained. Otherwise, return `None`.
        #[must_use]
        pub fn $fn_name(&self) -> Option<$t> {
            if let Self::$variant(inner) = *self {
                Some(inner)
            } else {
                None
            }
        }
    );
);
/// Generate `as_TYPE` methods for the [`crate::Yaml`] enum, returning references.
///
/// `$t` is the full return type, reference included (e.g. `&str`); the payload of `$variant` is
/// borrowed from `self`.
macro_rules! define_as_ref (
    ($fn_name:ident, $t:ty, $variant:ident) => (
        /// Get a reference to the inner object in the YAML enum if it is a `$t`.
        ///
        /// # Return
        /// If the variant of `self` is `Self::$variant`, return `Some(&$t)` with the `$t`
        /// contained. Otherwise, return `None`.
        #[must_use]
        pub fn $fn_name(&self) -> Option<$t> {
            if let Self::$variant(inner) = self {
                Some(inner)
            } else {
                None
            }
        }
    );
);
/// Generate `as_TYPE` methods for the [`crate::Yaml`] enum, returning mutable references.
///
/// `$t` is the full return type, `&mut` included; the payload of `$variant` is mutably borrowed
/// from `self`.
macro_rules! define_as_mut_ref (
    ($fn_name:ident, $t:ty, $variant:ident) => (
        /// Get a mutable reference to the inner object in the YAML enum if it is a `$t`.
        ///
        /// # Return
        /// If the variant of `self` is `Self::$variant`, return `Some(&mut $t)` with the `$t`
        /// contained. Otherwise, return `None`.
        #[must_use]
        pub fn $fn_name(&mut self) -> Option<$t> {
            if let Self::$variant(inner) = self {
                Some(inner)
            } else {
                None
            }
        }
    );
);
/// Generate `into_TYPE` methods for the [`crate::Yaml`] enum.
///
/// The generated method consumes `self` and moves the payload of `$variant` out of it.
macro_rules! define_into (
    ($fn_name:ident, $t:ty, $variant:ident) => (
        /// Get the inner object in the YAML enum if it is a `$t`.
        ///
        /// # Return
        /// If the variant of `self` is `Self::$variant`, return `Some($t)` with the `$t`
        /// contained. Otherwise, return `None`.
        #[must_use]
        pub fn $fn_name(self) -> Option<$t> {
            if let Self::$variant(inner) = self {
                Some(inner)
            } else {
                None
            }
        }
    );
);
/// Generate `is_TYPE` methods for the [`crate::Yaml`] enum.
///
/// `$variant` is a full pattern (e.g. `Self::Array(_)`), which is handed to `matches!`.
macro_rules! define_is (
    ($fn_name:ident, $variant:pat) => (
        /// Check whether the YAML enum contains the given variant.
        ///
        /// # Return
        /// If the variant of `self` is `Self::$variant`, return `true`. Otherwise, return `false`.
        #[must_use]
        pub fn $fn_name(&self) -> bool {
            matches!(self, $variant)
        }
    );
);

334
saphyr/src/yaml.rs Normal file
View file

@ -0,0 +1,334 @@
//! YAML objects manipulation utilities.
#![allow(clippy::module_name_repetitions)]
use std::{convert::TryFrom, ops::Index, ops::IndexMut};
use hashlink::LinkedHashMap;
use saphyr_parser::{Parser, ScanError};
use crate::{loader::parse_f64, YamlLoader};
/// A YAML node is stored as this `Yaml` enumeration, which provides an easy way to
/// access your YAML document.
///
/// # Examples
///
/// ```
/// use saphyr::Yaml;
/// let foo = Yaml::from_str("-123"); // convert the string to the appropriate YAML type
/// assert_eq!(foo.as_i64().unwrap(), -123);
///
/// // iterate over an Array
/// let vec = Yaml::Array(vec![Yaml::Integer(1), Yaml::Integer(2)]);
/// for v in vec.as_vec().unwrap() {
///     assert!(v.as_i64().is_some());
/// }
/// ```
#[derive(Clone, PartialEq, PartialOrd, Debug, Eq, Ord, Hash)]
pub enum Yaml {
    /// Float types are stored as String and parsed on demand.
    /// Note that `f64` does NOT implement Eq trait and can NOT be stored in `BTreeMap`.
    Real(String),
    /// YAML int is stored as i64.
    Integer(i64),
    /// YAML scalar.
    String(String),
    /// YAML bool, e.g. `true` or `false`.
    Boolean(bool),
    /// YAML array, can be accessed as a `Vec`.
    Array(Array),
    /// YAML hash, can be accessed as a `LinkedHashMap`.
    ///
    /// Iteration order matches the order in which the entries were inserted into the map.
    Hash(Hash),
    /// Alias, not fully supported yet.
    Alias(usize),
    /// YAML null, e.g. `null` or `~`.
    Null,
    /// Accessing a nonexistent node via the Index trait returns `BadValue`. This
    /// simplifies error handling in the calling code. Invalid type conversion also
    /// returns `BadValue`.
    BadValue,
}
/// The type contained in the `Yaml::Array` variant. This corresponds to YAML sequences.
pub type Array = Vec<Yaml>;
/// The type contained in the `Yaml::Hash` variant. This corresponds to YAML mappings.
pub type Hash = LinkedHashMap<Yaml, Yaml>;
impl Yaml {
    /// Parse `source` and return every YAML document it contains.
    ///
    /// Loading is all-or-nothing: if any document fails to parse, an error is returned and the
    /// documents that were parsed before it are not handed back.
    ///
    /// A YAML string usually holds a single document, in which case only the first element of the
    /// returned `Vec` is of interest. Otherwise, each element of the `Vec` is one document:
    ///
    /// ```
    /// use saphyr::Yaml;
    ///
    /// let docs = Yaml::load_from_str(r#"
    /// First document
    /// ---
    /// - Second document
    /// "#).unwrap();
    /// let first_document = &docs[0]; // Select the first YAML document
    /// // The document is a string containing "First document".
    /// assert_eq!(*first_document, Yaml::String("First document".to_owned()));
    ///
    /// let second_document = &docs[1]; // Select the second YAML document
    /// // The document is an array containing a single string, "Second document".
    /// assert_eq!(second_document[0], Yaml::String("Second document".to_owned()));
    /// ```
    ///
    /// # Errors
    /// Returns `ScanError` when loading fails.
    pub fn load_from_str(source: &str) -> Result<Vec<Self>, ScanError> {
        let chars = source.chars();
        Self::load_from_iter(chars)
    }

    /// Parse the characters yielded by `source` and return every YAML document they contain.
    ///
    /// See [`Self::load_from_str`] for details.
    ///
    /// # Errors
    /// Returns `ScanError` when loading fails.
    pub fn load_from_iter<I: Iterator<Item = char>>(source: I) -> Result<Vec<Yaml>, ScanError> {
        Self::load_from_parser(&mut Parser::new(source))
    }

    /// Drive `parser` to completion and return every YAML document it produces.
    ///
    /// See [`Self::load_from_str`] for details.
    ///
    /// # Errors
    /// Returns `ScanError` when loading fails.
    pub fn load_from_parser<I: Iterator<Item = char>>(
        parser: &mut Parser<I>,
    ) -> Result<Vec<Yaml>, ScanError> {
        let mut document_builder = YamlLoader::default();
        parser.load(&mut document_builder, true)?;
        Ok(document_builder.into_documents())
    }

    define_as!(as_bool, bool, Boolean);
    define_as!(as_i64, i64, Integer);

    define_as_ref!(as_hash, &Hash, Hash);
    define_as_ref!(as_str, &str, String);
    define_as_ref!(as_vec, &Array, Array);

    define_as_mut_ref!(as_mut_hash, &mut Hash, Hash);
    define_as_mut_ref!(as_mut_vec, &mut Array, Array);

    define_into!(into_bool, bool, Boolean);
    define_into!(into_hash, Hash, Hash);
    define_into!(into_i64, i64, Integer);
    define_into!(into_string, String, String);
    define_into!(into_vec, Array, Array);

    define_is!(is_alias, Self::Alias(_));
    define_is!(is_array, Self::Array(_));
    define_is!(is_badvalue, Self::BadValue);
    define_is!(is_boolean, Self::Boolean(_));
    define_is!(is_hash, Self::Hash(_));
    define_is!(is_integer, Self::Integer(_));
    define_is!(is_null, Self::Null);
    define_is!(is_real, Self::Real(_));
    define_is!(is_string, Self::String(_));

    /// Parse the text of a [`Yaml::Real`] node into an `f64`.
    ///
    /// `None` is returned if the node is not a [`Yaml::Real`] or if its text is not a valid
    /// `f64`.
    #[must_use]
    pub fn as_f64(&self) -> Option<f64> {
        match self {
            Yaml::Real(text) => parse_f64(text),
            _ => None,
        }
    }

    /// Consuming counterpart of [`Self::as_f64`].
    ///
    /// `None` is returned if the node is not a [`Yaml::Real`] or if its text is not a valid
    /// `f64`.
    #[must_use]
    pub fn into_f64(self) -> Option<f64> {
        self.as_f64()
    }

    /// Return `other` if `self` is [`Yaml::Null`] or [`Yaml::BadValue`], and `self` otherwise.
    ///
    /// ```
    /// use saphyr::Yaml;
    ///
    /// assert_eq!(Yaml::BadValue.or(Yaml::Integer(3)), Yaml::Integer(3));
    /// assert_eq!(Yaml::Integer(3).or(Yaml::BadValue), Yaml::Integer(3));
    /// ```
    #[must_use]
    pub fn or(self, other: Self) -> Self {
        if matches!(self, Yaml::BadValue | Yaml::Null) {
            other
        } else {
            self
        }
    }

    /// Borrowing counterpart of [`Self::or`].
    ///
    /// The same selection is made, but on references, which suits less linear pipelines.
    #[must_use]
    pub fn borrowed_or<'a>(&'a self, other: &'a Self) -> &'a Self {
        if matches!(self, Yaml::BadValue | Yaml::Null) {
            other
        } else {
            self
        }
    }
}
// `from_str` is infallible, so it deliberately does not go through `std::str::FromStr`.
#[allow(clippy::should_implement_trait)]
impl Yaml {
    /// Convert a string to a [`Yaml`] node.
    ///
    /// [`Yaml`] does not implement [`std::str::FromStr`] since conversion may not fail. This
    /// function falls back to [`Yaml::String`] if nothing else matches.
    ///
    /// Integers may be written in decimal (with an optional sign), in hexadecimal (`0x` prefix)
    /// or in octal (`0o` prefix). A sign is not accepted after a `0x`, `0o` or `+` prefix: such
    /// inputs are not integers and fall through to the other types.
    ///
    /// # Examples
    /// ```
    /// # use saphyr::Yaml;
    /// assert!(matches!(Yaml::from_str("42"), Yaml::Integer(42)));
    /// assert!(matches!(Yaml::from_str("+42"), Yaml::Integer(42)));
    /// assert!(matches!(Yaml::from_str("0x2A"), Yaml::Integer(42)));
    /// assert!(matches!(Yaml::from_str("0o52"), Yaml::Integer(42)));
    /// assert!(matches!(Yaml::from_str("~"), Yaml::Null));
    /// assert!(matches!(Yaml::from_str("null"), Yaml::Null));
    /// assert!(matches!(Yaml::from_str("true"), Yaml::Boolean(true)));
    /// assert!(matches!(Yaml::from_str("3.14"), Yaml::Real(_)));
    /// assert!(matches!(Yaml::from_str("foo"), Yaml::String(_)));
    /// assert!(matches!(Yaml::from_str("+-5"), Yaml::String(_)));
    /// assert!(matches!(Yaml::from_str("0x-1"), Yaml::String(_)));
    /// ```
    #[must_use]
    pub fn from_str(v: &str) -> Yaml {
        // Parse `digits` in base `radix`, refusing a leading sign. `i64::from_str_radix` accepts
        // one, which would turn inputs such as `0x-1` or `+-5` into integers.
        fn parse_unsigned(digits: &str, radix: u32) -> Option<i64> {
            if digits.starts_with(|c: char| c == '+' || c == '-') {
                None
            } else {
                i64::from_str_radix(digits, radix).ok()
            }
        }

        let prefixed = if let Some(digits) = v.strip_prefix("0x") {
            parse_unsigned(digits, 16)
        } else if let Some(digits) = v.strip_prefix("0o") {
            parse_unsigned(digits, 8)
        } else if let Some(digits) = v.strip_prefix('+') {
            parse_unsigned(digits, 10)
        } else {
            None
        };
        if let Some(i) = prefixed {
            return Yaml::Integer(i);
        }

        match v {
            "~" | "null" => Yaml::Null,
            "true" => Yaml::Boolean(true),
            "false" => Yaml::Boolean(false),
            _ => {
                if let Ok(integer) = v.parse::<i64>() {
                    Yaml::Integer(integer)
                } else if parse_f64(v).is_some() {
                    // Reals keep their textual form and are parsed on demand.
                    Yaml::Real(v.to_owned())
                } else {
                    Yaml::String(v.to_owned())
                }
            }
        }
    }
}
/// Shared `BadValue` instance that the [`Index`] implementations hand out by reference when a
/// lookup fails.
static BAD_VALUE: Yaml = Yaml::BadValue;
impl<'a> Index<&'a str> for Yaml {
    type Output = Yaml;

    /// Look up the string key `idx` if `self` is a mapping.
    ///
    /// A reference to a `BadValue` is returned if `self` is not a [`Yaml::Hash`] or if the
    /// mapping has no such key. This function does not panic.
    fn index(&self, idx: &'a str) -> &Yaml {
        self.as_hash()
            .and_then(|mapping| mapping.get(&Yaml::String(idx.to_owned())))
            .unwrap_or(&BAD_VALUE)
    }
}
impl<'a> IndexMut<&'a str> for Yaml {
    /// Mutably access the value stored under the string key `idx` if `self` is a mapping.
    ///
    /// # Panics
    /// This function panics if the key given does not exist within `self`. Note that this differs
    /// from the [`Index`] implementation, which returns a `BadValue` in that case.
    ///
    /// This function also panics if `self` is not a [`Yaml::Hash`].
    fn index_mut(&mut self, idx: &'a str) -> &mut Yaml {
        let key = Yaml::String(idx.to_owned());
        match self {
            Yaml::Hash(mapping) => mapping.get_mut(&key).unwrap(),
            _ => panic!("Not a hash type"),
        }
    }
}
impl Index<usize> for Yaml {
    type Output = Yaml;

    /// Look up `idx` if `self` is a sequence or a mapping.
    ///
    /// For a [`Yaml::Array`], `idx` is a position in the sequence. For a [`Yaml::Hash`], the
    /// entry whose key is [`Yaml::Integer`]`(idx)` is looked up.
    ///
    /// A reference to a `BadValue` is returned if the lookup fails for any reason: `self` is
    /// neither a sequence nor a mapping, the position is out of range, the key is absent, or
    /// `idx` does not fit in an `i64` (no integer key can match it). This function does not
    /// panic.
    fn index(&self, idx: usize) -> &Yaml {
        let found = match self {
            Yaml::Array(sequence) => sequence.get(idx),
            Yaml::Hash(mapping) => i64::try_from(idx)
                .ok()
                .and_then(|key| mapping.get(&Yaml::Integer(key))),
            _ => None,
        };
        found.unwrap_or(&BAD_VALUE)
    }
}
impl IndexMut<usize> for Yaml {
    /// Mutably access `idx` if `self` is a sequence or a mapping.
    ///
    /// # Panics
    /// This function panics if the index given is out of range (as per [`IndexMut`]). If `self` is
    /// a [`Yaml::Array`], this is when the index is bigger or equal to the length of the
    /// underlying `Vec`. If `self` is a [`Yaml::Hash`], this is when the mapping does not contain
    /// [`Yaml::Integer`]`(idx)` as a key, or when `idx` does not fit in an `i64`.
    ///
    /// This function also panics if `self` is not a [`Yaml::Array`] nor a [`Yaml::Hash`].
    fn index_mut(&mut self, idx: usize) -> &mut Yaml {
        match self {
            Yaml::Array(sequence) => &mut sequence[idx],
            Yaml::Hash(mapping) => {
                let key = i64::try_from(idx).map(Yaml::Integer).unwrap();
                mapping.get_mut(&key).unwrap()
            }
            _ => panic!("Attempting to index but `self` is not a sequence nor a mapping"),
        }
    }
}
impl IntoIterator for Yaml {
    type Item = Yaml;
    type IntoIter = YamlIter;

    /// Iterate over the elements of a [`Yaml::Array`] by value.
    ///
    /// Any other variant yields an empty iterator.
    fn into_iter(self) -> Self::IntoIter {
        let elements = self.into_vec().unwrap_or_default();
        YamlIter {
            yaml: elements.into_iter(),
        }
    }
}
/// An iterator over a [`Yaml`] node.
///
/// It is created by the [`IntoIterator`] implementation of [`Yaml`] and yields the elements of
/// an array node by value.
pub struct YamlIter {
    /// The elements that remain to be yielded.
    yaml: std::vec::IntoIter<Yaml>,
}
impl Iterator for YamlIter {
    type Item = Yaml;
    /// Yield the next element, or `None` once all elements have been returned.
    fn next(&mut self) -> Option<Yaml> {
        self.yaml.next()
    }
}

235
saphyr/tests/basic.rs Normal file
View file

@ -0,0 +1,235 @@
#![allow(clippy::bool_assert_comparison)]
#![allow(clippy::float_cmp)]
use saphyr::{Yaml, YamlEmitter};
/// Smoke test of the public API: load a document, index into it by position and by key, then
/// dump it through the emitter and check that some output was produced.
#[test]
fn test_api() {
let s = "
# from yaml-cpp example
- name: Ogre
position: [0, 5, 0]
powers:
- name: Club
damage: 10
- name: Fist
damage: 8
- name: Dragon
position: [1, 0, 10]
powers:
- name: Fire Breath
damage: 25
- name: Claws
damage: 15
- name: Wizard
position: [5, -3, 0]
powers:
- name: Acid Rain
damage: 50
- name: Staff
damage: 3
";
let docs = Yaml::load_from_str(s).unwrap();
let doc = &docs[0];
assert_eq!(doc[0]["name"].as_str().unwrap(), "Ogre");
let mut writer = String::new();
{
let mut emitter = YamlEmitter::new(&mut writer);
emitter.dump(doc).unwrap();
}
assert!(!writer.is_empty());
}
/// The typed accessors return the loaded integer and real values, and indexing through a missing
/// key yields a `BadValue` even when the indexing is chained.
#[test]
fn test_coerce() {
let s = "---
a: 1
b: 2.2
c: [1, 2]
";
let out = Yaml::load_from_str(s).unwrap();
let doc = &out[0];
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64);
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
// "d" does not exist: the first lookup gives a `BadValue`, and so does indexing into it.
assert!(doc["d"][0].is_badvalue());
}
/// An alias gives access to the contents of the node carrying the anchor it refers to.
#[test]
fn test_anchor() {
let s = "
a1: &DEFAULT
b1: 4
b2: d
a2: *DEFAULT
";
let out = Yaml::load_from_str(s).unwrap();
let doc = &out[0];
assert_eq!(doc["a2"]["b1"].as_i64().unwrap(), 4);
}
/// An alias used inside the very node that carries its anchor loads as a `BadValue`.
#[test]
fn test_bad_anchor() {
let s = "
a1: &DEFAULT
b1: 4
b2: *DEFAULT
";
let out = Yaml::load_from_str(s).unwrap();
let doc = &out[0];
assert_eq!(doc["a1"]["b2"], Yaml::BadValue);
}
/// Scalars are resolved to the expected variant, both through implicit typing and through the
/// `!!str`, `!!int`, `!!float`, `!!null` and `!!bool` tags. A tagged scalar whose text does not
/// match its tag loads as a `BadValue`.
#[test]
fn test_plain_datatype() {
let s = "
- 'string'
- \"string\"
- string
- 123
- -321
- 1.23
- -1e4
- ~
- null
- true
- false
- !!str 0
- !!int 100
- !!float 2
- !!null ~
- !!bool true
- !!bool false
- 0xFF
# bad values
- !!int string
- !!float string
- !!bool null
- !!null val
- 0o77
- [ 0xF, 0xF ]
- +12345
- [ true, false ]
";
let out = Yaml::load_from_str(s).unwrap();
let doc = &out[0];
assert_eq!(doc[0].as_str().unwrap(), "string");
assert_eq!(doc[1].as_str().unwrap(), "string");
assert_eq!(doc[2].as_str().unwrap(), "string");
assert_eq!(doc[3].as_i64().unwrap(), 123);
assert_eq!(doc[4].as_i64().unwrap(), -321);
assert_eq!(doc[5].as_f64().unwrap(), 1.23);
assert_eq!(doc[6].as_f64().unwrap(), -1e4);
assert!(doc[7].is_null());
assert!(doc[8].is_null());
assert_eq!(doc[9].as_bool().unwrap(), true);
assert_eq!(doc[10].as_bool().unwrap(), false);
assert_eq!(doc[11].as_str().unwrap(), "0");
assert_eq!(doc[12].as_i64().unwrap(), 100);
assert_eq!(doc[13].as_f64().unwrap(), 2.0);
assert!(doc[14].is_null());
assert_eq!(doc[15].as_bool().unwrap(), true);
assert_eq!(doc[16].as_bool().unwrap(), false);
assert_eq!(doc[17].as_i64().unwrap(), 255);
// Entries 18 to 21 carry a tag that their text does not satisfy.
assert!(doc[18].is_badvalue());
assert!(doc[19].is_badvalue());
assert!(doc[20].is_badvalue());
assert!(doc[21].is_badvalue());
assert_eq!(doc[22].as_i64().unwrap(), 63);
assert_eq!(doc[23][0].as_i64().unwrap(), 15);
assert_eq!(doc[23][1].as_i64().unwrap(), 15);
assert_eq!(doc[24].as_i64().unwrap(), 12345);
assert!(doc[25][0].as_bool().unwrap());
assert!(!doc[25][1].as_bool().unwrap());
}
/// Same kind of checks as `test_plain_datatype`, but consuming the nodes through the `into_*`
/// methods while iterating over the document by value. Also covers the infinity and
/// not-a-number spellings.
#[test]
fn test_plain_datatype_with_into_methods() {
let s = "
- 'string'
- \"string\"
- string
- 123
- -321
- 1.23
- -1e4
- true
- false
- !!str 0
- !!int 100
- !!float 2
- !!bool true
- !!bool false
- 0xFF
- 0o77
- +12345
- -.INF
- .NAN
- !!float .INF
";
let mut out = Yaml::load_from_str(s).unwrap().into_iter();
let mut doc = out.next().unwrap().into_iter();
assert_eq!(doc.next().unwrap().into_string().unwrap(), "string");
assert_eq!(doc.next().unwrap().into_string().unwrap(), "string");
assert_eq!(doc.next().unwrap().into_string().unwrap(), "string");
assert_eq!(doc.next().unwrap().into_i64().unwrap(), 123);
assert_eq!(doc.next().unwrap().into_i64().unwrap(), -321);
assert_eq!(doc.next().unwrap().into_f64().unwrap(), 1.23);
assert_eq!(doc.next().unwrap().into_f64().unwrap(), -1e4);
assert_eq!(doc.next().unwrap().into_bool().unwrap(), true);
assert_eq!(doc.next().unwrap().into_bool().unwrap(), false);
assert_eq!(doc.next().unwrap().into_string().unwrap(), "0");
assert_eq!(doc.next().unwrap().into_i64().unwrap(), 100);
assert_eq!(doc.next().unwrap().into_f64().unwrap(), 2.0);
assert_eq!(doc.next().unwrap().into_bool().unwrap(), true);
assert_eq!(doc.next().unwrap().into_bool().unwrap(), false);
assert_eq!(doc.next().unwrap().into_i64().unwrap(), 255);
assert_eq!(doc.next().unwrap().into_i64().unwrap(), 63);
assert_eq!(doc.next().unwrap().into_i64().unwrap(), 12345);
assert_eq!(doc.next().unwrap().into_f64().unwrap(), f64::NEG_INFINITY);
// NaN never compares equal to itself, so only check that a float was produced.
assert!(doc.next().unwrap().into_f64().is_some());
assert_eq!(doc.next().unwrap().into_f64().unwrap(), f64::INFINITY);
}
/// Iterating over a loaded mapping yields its entries in document order (`b`, `a`, `c`), not in
/// sorted key order.
#[test]
fn test_hash_order() {
let s = "---
b: ~
a: ~
c: ~
";
let out = Yaml::load_from_str(s).unwrap();
let first = out.into_iter().next().unwrap();
let mut iter = first.into_hash().unwrap().into_iter();
assert_eq!(
Some((Yaml::String("b".to_owned()), Yaml::Null)),
iter.next()
);
assert_eq!(
Some((Yaml::String("a".to_owned()), Yaml::Null)),
iter.next()
);
assert_eq!(
Some((Yaml::String("c".to_owned()), Yaml::Null)),
iter.next()
);
assert_eq!(None, iter.next());
}
/// Indexing a mapping with a `usize` looks up the matching integer key.
#[test]
fn test_integer_key() {
let s = "
0:
important: true
1:
important: false
";
let out = Yaml::load_from_str(s).unwrap();
let first = out.into_iter().next().unwrap();
assert_eq!(first[0]["important"].as_bool().unwrap(), true);
}

294
saphyr/tests/emitter.rs Normal file
View file

@ -0,0 +1,294 @@
use saphyr::{Yaml, YamlEmitter};
/// Round trip: a document that is loaded, emitted and loaded again must compare equal to the
/// document that was first loaded.
#[allow(clippy::similar_names)]
#[test]
fn test_emit_simple() {
let s = "
# comment
a0 bb: val
a1:
b1: 4
b2: d
a2: 4 # i'm comment
a3: [1, 2, 3]
a4:
- [a1, a2]
- 2
";
let docs = Yaml::load_from_str(s).unwrap();
let doc = &docs[0];
let mut writer = String::new();
{
let mut emitter = YamlEmitter::new(&mut writer);
emitter.dump(doc).unwrap();
}
println!("original:\n{s}");
println!("emitted:\n{writer}");
let docs_new = match Yaml::load_from_str(&writer) {
Ok(y) => y,
Err(e) => panic!("{}", e),
};
let doc_new = &docs_new[0];
assert_eq!(doc, doc_new);
}
/// Round trip of a document using anchors, aliases and non-string mapping keys (array, real,
/// boolean and empty hash keys): reloading the emitted text must give back an equal document.
#[test]
fn test_emit_complex() {
let s = r"
catalogue:
product: &coffee { name: Coffee, price: 2.5 , unit: 1l }
product: &cookies { name: Cookies!, price: 3.40 , unit: 400g}
products:
*coffee :
amount: 4
*cookies :
amount: 4
[1,2,3,4]:
array key
2.4:
real key
true:
bool key
{}:
empty hash key
";
let docs = Yaml::load_from_str(s).unwrap();
let doc = &docs[0];
let mut writer = String::new();
{
let mut emitter = YamlEmitter::new(&mut writer);
emitter.dump(doc).unwrap();
}
let docs_new = match Yaml::load_from_str(&writer) {
Ok(y) => y,
Err(e) => panic!("{}", e),
};
let new_doc = &docs_new[0];
assert_eq!(doc, new_doc);
}
/// The emitter must reproduce the input text exactly: strings are quoted only where the input
/// quotes them (e.g. strings that would otherwise read as booleans, numbers or nulls) and are
/// left bare otherwise.
#[test]
fn test_emit_avoid_quotes() {
let s = r#"---
a7:
boolean: "true"
boolean2: "false"
date: 2014-12-31
empty_string: ""
empty_string1: " "
empty_string2: " a"
empty_string3: " a "
exp: "12e7"
field: ":"
field2: "{"
field3: "\\"
field4: "\n"
field5: "can't avoid quote"
float: "2.6"
int: "4"
nullable: "null"
nullable2: "~"
products:
"*coffee":
amount: 4
"*cookies":
amount: 4
".milk":
amount: 1
"2.4": real key
"[1,2,3,4]": array key
"true": bool key
"{}": empty hash key
x: test
y: avoid quoting here
z: string with spaces"#;
let docs = Yaml::load_from_str(s).unwrap();
let doc = &docs[0];
let mut writer = String::new();
{
let mut emitter = YamlEmitter::new(&mut writer);
emitter.dump(doc).unwrap();
}
assert_eq!(s, writer, "actual:\n\n{writer}\n");
}
/// Strings that resemble booleans (`yes`, `no`, `True`, `on`, `off`, ...) are emitted quoted,
/// actual booleans and nulls are emitted bare, and sequence keys are emitted with the explicit
/// `?` / `:` notation. The emitted text is compared against `expected` verbatim.
#[test]
fn emit_quoted_bools() {
let input = r#"---
string0: yes
string1: no
string2: "true"
string3: "false"
string4: "~"
null0: ~
[true, false]: real_bools
[True, TRUE, False, FALSE, y,Y,yes,Yes,YES,n,N,no,No,NO,on,On,ON,off,Off,OFF]: false_bools
bool0: true
bool1: false"#;
let expected = r#"---
string0: "yes"
string1: "no"
string2: "true"
string3: "false"
string4: "~"
null0: ~
? - true
- false
: real_bools
? - "True"
- "TRUE"
- "False"
- "FALSE"
- y
- Y
- "yes"
- "Yes"
- "YES"
- n
- N
- "no"
- "No"
- "NO"
- "on"
- "On"
- "ON"
- "off"
- "Off"
- "OFF"
: false_bools
bool0: true
bool1: false"#;
let docs = Yaml::load_from_str(input).unwrap();
let doc = &docs[0];
let mut writer = String::new();
{
let mut emitter = YamlEmitter::new(&mut writer);
emitter.dump(doc).unwrap();
}
assert_eq!(
expected, writer,
"expected:\n{expected}\nactual:\n{writer}\n",
);
}
/// Run the empty/nested container round trip with the emitter's compact mode disabled.
#[test]
fn test_empty_and_nested() {
test_empty_and_nested_flag(false);
}
/// Run the empty/nested container round trip with the emitter's compact mode enabled.
#[test]
fn test_empty_and_nested_compact() {
test_empty_and_nested_flag(true);
}
/// Shared body of the two tests above: load a document containing empty and nested containers,
/// emit it with compact mode set to `compact`, and require the output to equal the input.
///
/// The two inputs differ only in how a mapping nested in a sequence is laid out (`- h: []` on
/// one line in compact mode, `-` followed by `h: []` on the next line otherwise).
fn test_empty_and_nested_flag(compact: bool) {
let s = if compact {
r"---
a:
b:
c: hello
d: {}
e:
- f
- g
- h: []"
} else {
r"---
a:
b:
c: hello
d: {}
e:
- f
- g
-
h: []"
};
let docs = Yaml::load_from_str(s).unwrap();
let doc = &docs[0];
let mut writer = String::new();
{
let mut emitter = YamlEmitter::new(&mut writer);
emitter.compact(compact);
emitter.dump(doc).unwrap();
}
assert_eq!(s, writer);
}
/// Sequences nested in sequences are emitted exactly as they were written in the input.
#[test]
fn test_nested_arrays() {
let s = r"---
a:
- b
- - c
- d
- - e
- f";
let docs = Yaml::load_from_str(s).unwrap();
let doc = &docs[0];
let mut writer = String::new();
{
let mut emitter = YamlEmitter::new(&mut writer);
emitter.dump(doc).unwrap();
}
println!("original:\n{s}");
println!("emitted:\n{writer}");
assert_eq!(s, writer);
}
/// Same as `test_nested_arrays`, with one more level of sequence nesting.
#[test]
fn test_deeply_nested_arrays() {
let s = r"---
a:
- b
- - c
- d
- - e
- - f
- - e";
let docs = Yaml::load_from_str(s).unwrap();
let doc = &docs[0];
let mut writer = String::new();
{
let mut emitter = YamlEmitter::new(&mut writer);
emitter.dump(doc).unwrap();
}
println!("original:\n{s}");
println!("emitted:\n{writer}");
assert_eq!(s, writer);
}
/// Mappings nested in mappings are emitted exactly as they were written in the input.
#[test]
fn test_nested_hashes() {
let s = r"---
a:
b:
c:
d:
e: f";
let docs = Yaml::load_from_str(s).unwrap();
let doc = &docs[0];
let mut writer = String::new();
{
let mut emitter = YamlEmitter::new(&mut writer);
emitter.dump(doc).unwrap();
}
println!("original:\n{s}");
println!("emitted:\n{writer}");
assert_eq!(s, writer);
}

View file

@ -0,0 +1,21 @@
#[macro_use]
extern crate quickcheck;
use quickcheck::TestResult;
use saphyr::{Yaml, YamlEmitter};
// Property: for any list of arbitrary strings, emitting it as a YAML array and loading the
// result back must give exactly one document, equal to the original array.
quickcheck! {
fn test_check_weird_keys(xs: Vec<String>) -> TestResult {
let mut out_str = String::new();
let input = Yaml::Array(xs.into_iter().map(Yaml::String).collect());
{
let mut emitter = YamlEmitter::new(&mut out_str);
emitter.dump(&input).unwrap();
}
match Yaml::load_from_str(&out_str) {
Ok(output) => TestResult::from_bool(output.len() == 1 && input == output[0]),
// A load failure means the emitter produced text that is not valid YAML.
Err(err) => TestResult::error(err.to_string()),
}
}
}

57
saphyr/tests/spec_test.rs Normal file
View file

@ -0,0 +1,57 @@
use saphyr::{Hash, Yaml, YamlEmitter};
/// A mapping whose key is itself a mapping containing a sequence must be emitted as YAML that
/// can be loaded again. Only the success of the reload is checked, not the loaded contents.
#[test]
fn test_mapvec_legal() {
// Emitting a `map<map<seq<_>>, _>` should result in legal yaml that
// we can parse.
let key = vec![Yaml::Integer(1), Yaml::Integer(2), Yaml::Integer(3)];
let mut keyhash = Hash::new();
keyhash.insert(Yaml::String("key".into()), Yaml::Array(key));
let val = vec![Yaml::Integer(4), Yaml::Integer(5), Yaml::Integer(6)];
let mut hash = Hash::new();
hash.insert(Yaml::Hash(keyhash), Yaml::Array(val));
let mut out_str = String::new();
{
let mut emitter = YamlEmitter::new(&mut out_str);
emitter.dump(&Yaml::Hash(hash)).unwrap();
}
// At this point, we are tempted to naively render like this:
//
// ```yaml
// ---
// {key:
// - 1
// - 2
// - 3}:
// - 4
// - 5
// - 6
// ```
//
// However, this doesn't work, because the key sequence [1, 2, 3] is
// rendered in block mode, which is not legal (as far as I can tell)
// inside the flow mode of the key. We need to either fully render
// everything that's in a key in flow mode (which may make for some
// long lines), or use the explicit map identifier '?':
//
// ```yaml
// ---
// ?
// key:
// - 1
// - 2
// - 3
// :
// - 4
// - 5
// - 6
// ```
Yaml::load_from_str(&out_str).unwrap();
}

1513
saphyr/tests/spec_test.rs.inc generated Normal file

File diff suppressed because it is too large Load diff

337
saphyr/tests/specexamples.rs.inc generated Normal file
View file

@ -0,0 +1,337 @@
const EX2_1 : &str =
"- Mark McGwire\n- Sammy Sosa\n- Ken Griffey";
const EX2_2 : &str =
"hr: 65 # Home runs\navg: 0.278 # Batting average\nrbi: 147 # Runs Batted In";
const EX2_3 : &str =
"american:\n- Boston Red Sox\n- Detroit Tigers\n- New York Yankees\nnational:\n- New York Mets\n- Chicago Cubs\n- Atlanta Braves";
const EX2_4 : &str =
"-\n name: Mark McGwire\n hr: 65\n avg: 0.278\n-\n name: Sammy Sosa\n hr: 63\n avg: 0.288";
const EX2_5 : &str =
"- [name , hr, avg ]\n- [Mark McGwire, 65, 0.278]\n- [Sammy Sosa , 63, 0.288]";
const EX2_6 : &str =
"Mark McGwire: {hr: 65, avg: 0.278}\nSammy Sosa: {\n hr: 63,\n avg: 0.288\n }";
const EX2_7 : &str =
"# Ranking of 1998 home runs\n---\n- Mark McGwire\n- Sammy Sosa\n- Ken Griffey\n\n# Team ranking\n---\n- Chicago Cubs\n- St Louis Cardinals";
const EX2_8 : &str =
"---\ntime: 20:03:20\nplayer: Sammy Sosa\naction: strike (miss)\n...\n---\ntime: 20:03:47\nplayer: Sammy Sosa\naction: grand slam\n...";
const EX2_9 : &str =
"---\nhr: # 1998 hr ranking\n - Mark McGwire\n - Sammy Sosa\nrbi:\n # 1998 rbi ranking\n - Sammy Sosa\n - Ken Griffey";
const EX2_10 : &str =
"---\nhr:\n - Mark McGwire\n # Following node labeled SS\n - &SS Sammy Sosa\nrbi:\n - *SS # Subsequent occurrence\n - Ken Griffey";
const EX2_11 : &str =
"? - Detroit Tigers\n - Chicago cubs\n:\n - 2001-07-23\n\n? [ New York Yankees,\n Atlanta Braves ]\n: [ 2001-07-02, 2001-08-12,\n 2001-08-14 ]";
const EX2_12 : &str =
"---\n# Products purchased\n- item : Super Hoop\n quantity: 1\n- item : Basketball\n quantity: 4\n- item : Big Shoes\n quantity: 1";
const EX2_13 : &str =
"# ASCII Art\n--- |\n \\//||\\/||\n // || ||__";
const EX2_14 : &str =
"--- >\n Mark McGwire's\n year was crippled\n by a knee injury.";
const EX2_15 : &str =
">\n Sammy Sosa completed another\n fine season with great stats.\n \n 63 Home Runs\n 0.288 Batting Average\n \n What a year!";
const EX2_16 : &str =
"name: Mark McGwire\naccomplishment: >\n Mark set a major league\n home run record in 1998.\nstats: |\n 65 Home Runs\n 0.278 Batting Average\n";
const EX2_17 : &str =
"unicode: \"Sosa did fine.\\u263A\"\ncontrol: \"\\b1998\\t1999\\t2000\\n\"\nhex esc: \"\\x0d\\x0a is \\r\\n\"\n\nsingle: '\"Howdy!\" he cried.'\nquoted: ' # Not a ''comment''.'\ntie-fighter: '|\\-*-/|'";
const EX2_18 : &str =
"plain:\n This unquoted scalar\n spans many lines.\n\nquoted: \"So does this\n quoted scalar.\\n\"";
// TODO: 2.19 - 2.22 schema tags
const EX2_23 : &str =
"---\nnot-date: !!str 2002-04-28\n\npicture: !!binary |\n R0lGODlhDAAMAIQAAP//9/X\n 17unp5WZmZgAAAOfn515eXv\n Pz7Y6OjuDg4J+fn5OTk6enp\n 56enmleECcgggoBADs=\n\napplication specific tag: !something |\n The semantics of the tag\n above may be different for\n different documents.";
const EX2_24 : &str =
"%TAG ! tag:clarkevans.com,2002:\n--- !shape\n # Use the ! handle for presenting\n # tag:clarkevans.com,2002:circle\n- !circle\n center: &ORIGIN {x: 73, y: 129}\n radius: 7\n- !line\n start: *ORIGIN\n finish: { x: 89, y: 102 }\n- !label\n start: *ORIGIN\n color: 0xFFEEBB\n text: Pretty vector drawing.";
const EX2_25 : &str =
"# Sets are represented as a\n# Mapping where each key is\n# associated with a null value\n--- !!set\n? Mark McGwire\n? Sammy Sosa\n? Ken Griffey";
const EX2_26 : &str =
"# Ordered maps are represented as\n# A sequence of mappings, with\n# each mapping having one key\n--- !!omap\n- Mark McGwire: 65\n- Sammy Sosa: 63\n- Ken Griffey: 58";
const EX2_27 : &str =
"--- !<tag:clarkevans.com,2002:invoice>\ninvoice: 34843\ndate : 2001-01-23\nbill-to: &id001\n given : Chris\n family : Dumars\n address:\n lines: |\n 458 Walkman Dr.\n Suite #292\n city : Royal Oak\n state : MI\n postal : 48046\nship-to: *id001\nproduct:\n - sku : BL394D\n quantity : 4\n description : Basketball\n price : 450.00\n - sku : BL4438H\n quantity : 1\n description : Super Hoop\n price : 2392.00\ntax : 251.42\ntotal: 4443.52\ncomments:\n Late afternoon is best.\n Backup contact is Nancy\n Billsmer @ 338-4338.";
const EX2_28 : &str =
"---\nTime: 2001-11-23 15:01:42 -5\nUser: ed\nWarning:\n This is an error message\n for the log file\n---\nTime: 2001-11-23 15:02:31 -5\nUser: ed\nWarning:\n A slightly different error\n message.\n---\nDate: 2001-11-23 15:03:17 -5\nUser: ed\nFatal:\n Unknown variable \"bar\"\nStack:\n - file: TopClass.py\n line: 23\n code: |\n x = MoreObject(\"345\\n\")\n - file: MoreClass.py\n line: 58\n code: |-\n foo = bar";
// Chapter 5 inputs.
// NOTE(review): numbering presumably follows the YAML specification examples; the TODO
// markers below list the numbers that have no constant yet.

// TODO: 5.1 - 5.2 BOM
// Block sequence plus a block mapping mixing an explicit (`?` / `:`) and an implicit entry.
const EX5_3 : &str =
"sequence:\n- one\n- two\nmapping:\n ? sky\n : blue\n sea : green";
// Flow sequence and flow mapping nested in a block mapping.
const EX5_4 : &str =
"sequence: [ one, two, ]\nmapping: { sky: blue, sea: green }";
// Input made of a single comment line.
const EX5_5 : &str = "# Comment only.";
// A tagged, anchored scalar and an alias to it.
const EX5_6 : &str =
"anchored: !local &anchor value\nalias: *anchor";
// One literal (`|`) and one folded (`>`) block scalar.
const EX5_7 : &str =
"literal: |\n some\n text\nfolded: >\n some\n text\n";
// One single-quoted and one double-quoted scalar.
const EX5_8 : &str =
"single: 'text'\ndouble: \"text\"";
// TODO: 5.9 directive
// TODO: 5.10 reserved indicator
const EX5_11 : &str =
"|\n Line break (no glyph)\n Line break (glyphed)\n";
// Contains literal tab characters, both inside a double-quoted scalar and a block scalar.
const EX5_12 : &str =
"# Tabs and spaces\nquoted: \"Quoted\t\"\nblock: |\n void main() {\n \tprintf(\"Hello, world!\\n\");\n }";
// A double-quoted scalar built from backslash escape sequences.
const EX5_13 : &str =
"\"Fun with \\\\\n\\\" \\a \\b \\e \\f \\\n\\n \\r \\t \\v \\0 \\\n\\ \\_ \\N \\L \\P \\\n\\x41 \\u0041 \\U00000041\"";
// The input labels itself "Bad escapes"; presumably expected to be rejected by the parser.
const EX5_14 : &str =
"Bad escapes:\n \"\\c\n \\xq-\"";
// Chapter 6 inputs.
// NOTE(review): numbering presumably follows the YAML specification examples. Judging by
// the embedded comments and keys, these cover whitespace, comments, directives, tags and
// anchors; confirm against the tests that consume them.
const EX6_1 : &str =
" # Leading comment line spaces are\n # neither content nor indentation.\n \nNot indented:\n By one space: |\n By four\n spaces\n Flow style: [ # Leading spaces\n By two, # in flow style\n Also by two, # are neither\n \tStill by two # content nor\n ] # indentation.";
const EX6_2 : &str =
"? a\n: -\tb\n - -\tc\n - d";
const EX6_3 : &str =
"- foo:\t bar\n- - baz\n -\tbaz";
const EX6_4 : &str =
"plain: text\n lines\nquoted: \"text\n \tlines\"\nblock: |\n text\n \tlines\n";
const EX6_5 : &str =
"Folding:\n \"Empty line\n \t\n as a line feed\"\nChomping: |\n Clipped empty lines\n ";
const EX6_6 : &str =
">-\n trimmed\n \n \n\n as\n space";
const EX6_7 : &str =
">\n foo \n \n \t bar\n\n baz\n";
const EX6_8 : &str =
"\"\n foo \n \n \t bar\n\n baz\n\"";
const EX6_9 : &str =
"key: # Comment\n value";
// Only a comment and blank lines: no node content.
const EX6_10 : &str =
" # Comment\n \n\n";
const EX6_11 : &str =
"key: # Comment\n # lines\n value\n\n";
const EX6_12 : &str =
"{ first: Sammy, last: Sosa }:\n# Statistics:\n hr: # Home runs\n 65\n avg: # Average\n 0.278";
// Uses a `%FOO` directive, which the embedded comment says should be ignored with a warning.
const EX6_13 : &str =
"%FOO bar baz # Should be ignored\n # with a warning.\n--- \"foo\"";
// Declares `%YAML 1.3`; the embedded comment says parsing should be attempted with a warning.
const EX6_14 : &str =
"%YAML 1.3 # Attempt parsing\n # with a warning\n---\n\"foo\"";
// Two `%YAML` directives in the same document.
const EX6_15 : &str =
"%YAML 1.2\n%YAML 1.1\nfoo";
const EX6_16 : &str =
"%TAG !yaml! tag:yaml.org,2002:\n---\n!yaml!str \"foo\"";
// The same `%TAG !` directive given twice in one document.
const EX6_17 : &str =
"%TAG ! !foo\n%TAG ! !foo\nbar";
// Two documents using `!foo`: the second one follows a `%TAG !` directive (the embedded
// comments call them "Private" and "Global").
const EX6_18 : &str =
"# Private\n!foo \"bar\"\n...\n# Global\n%TAG ! tag:example.com,2000:app/\n---\n!foo \"bar\"";
const EX6_19 : &str =
"%TAG !! tag:example.com,2000:app/\n---\n!!int 1 - 3 # Interval, not integer";
const EX6_20 : &str =
"%TAG !e! tag:example.com,2000:app/\n---\n!e!foo \"bar\"";
// Two documents, each declaring the `!m!` handle with its own `%TAG` directive.
const EX6_21 : &str =
"%TAG !m! !my-\n--- # Bulb here\n!m!light fluorescent\n...\n%TAG !m! !my-\n--- # Color here\n!m!light green";
const EX6_22 : &str =
"%TAG !e! tag:example.com,2000:app/\n---\n- !e!foo \"bar\"";
const EX6_23 : &str =
"!!str &a1 \"foo\":\n !!str bar\n&a2 baz : *a1";
const EX6_24 : &str =
"!<tag:yaml.org,2002:str> foo :\n !<!bar> baz";
const EX6_25 : &str =
"- !<!> foo\n- !<$:?> bar\n";
const EX6_26 : &str =
"%TAG !e! tag:example.com,2000:app/\n---\n- !local foo\n- !!str bar\n- !e!tag%21 baz\n";
// Variant "a": the declared `!e!` handle is used without any suffix.
const EX6_27a : &str =
"%TAG !e! tag:example,2000:app/\n---\n- !e! foo";
// Variant "b": the node uses an `!h!` handle while only `!e!` is declared.
const EX6_27b : &str =
"%TAG !e! tag:example,2000:app/\n---\n- !h!bar baz";
const EX6_28 : &str =
"# Assuming conventional resolution:\n- \"12\"\n- 12\n- ! 12";
const EX6_29 : &str =
"First occurrence: &anchor Value\nSecond occurrence: *anchor";
// Chapter 7 inputs.
// NOTE(review): numbering presumably follows the YAML specification examples. Judging by
// their content, these cover aliases, quoted and plain scalars, and flow collections; confirm
// against the tests that consume them.

// The `&anchor` name is defined twice and aliased after each definition.
const EX7_1 : &str =
"First occurrence: &anchor Foo\nSecond occurrence: *anchor\nOverride anchor: &anchor Bar\nReuse anchor: *anchor";
const EX7_2 : &str =
"{\n foo : !!str,\n !!str : bar,\n}";
const EX7_3 : &str =
"{\n ? foo :,\n : bar,\n}\n";
const EX7_4 : &str =
"\"implicit block key\" : [\n \"implicit flow key\" : value,\n ]";
const EX7_5 : &str =
"\"folded \nto a space,\t\n \nto a line feed, or \t\\\n \\ \tnon-content\"";
const EX7_6 : &str =
"\" 1st non-empty\n\n 2nd non-empty \n\t3rd non-empty \"";
// Single-quoted scalar containing an escaped quote (`''`) and literal double quotes.
const EX7_7 : &str = " 'here''s to \"quotes\"'";
const EX7_8 : &str =
"'implicit block key' : [\n 'implicit flow key' : value,\n ]";
const EX7_9 : &str =
"' 1st non-empty\n\n 2nd non-empty \n\t3rd non-empty '";
// The same plain scalars listed once in a block sequence and once inside a flow sequence.
const EX7_10 : &str =
"# Outside flow collection:\n- ::vector\n- \": - ()\"\n- Up, up, and away!\n- -123\n- http://example.com/foo#bar\n# Inside flow collection:\n- [ ::vector,\n \": - ()\",\n \"Up, up, and away!\",\n -123,\n http://example.com/foo#bar ]";
const EX7_11 : &str =
"implicit block key : [\n implicit flow key : value,\n ]";
const EX7_12 : &str =
"1st non-empty\n\n 2nd non-empty \n\t3rd non-empty";
const EX7_13 : &str =
"- [ one, two, ]\n- [three ,four]";
const EX7_14 : &str =
"[\n\"double\n quoted\", 'single\n quoted',\nplain\n text, [ nested ],\nsingle: pair,\n]";
const EX7_15 : &str =
"- { one : two , three: four , }\n- {five: six,seven : eight}";
const EX7_16 : &str =
"{\n? explicit: entry,\nimplicit: entry,\n?\n}";
const EX7_17 : &str =
"{\nunquoted : \"separate\",\nhttp://foo.com,\nomitted value:,\n: omitted key,\n}";
const EX7_18 : &str =
"{\n\"adjacent\":value,\n\"readable\":value,\n\"empty\":\n}";
const EX7_19 : &str =
"[\nfoo: bar\n]";
const EX7_20 : &str =
"[\n? foo\n bar : baz\n]";
const EX7_21 : &str =
"- [ YAML : separate ]\n- [ : empty key entry ]\n- [ {JSON: like}:adjacent ]";
// Unterminated flow sequence whose only entry is labelled "invalid".
const EX7_22 : &str =
"[ foo\n bar: invalid,"; // Note: we don't check (on purpose) the >1K chars for an
// implicit key
const EX7_23 : &str =
"- [ a, b ]\n- { a: b }\n- \"a\"\n- 'b'\n- c";
const EX7_24 : &str =
"- !!str \"a\"\n- 'b'\n- &anchor \"c\"\n- *anchor\n- !!str";
// Chapter 8 inputs.
// NOTE(review): numbering presumably follows the YAML specification examples. Judging by
// their content, these cover block scalars (headers, chomping, folding) and block
// collections; confirm against the tests that consume them.

// Block scalar headers: none, indentation indicator, chomping indicator, and both.
const EX8_1 : &str =
"- | # Empty header\n literal\n- >1 # Indentation indicator\n folded\n- |+ # Chomping indicator\n keep\n\n- >1- # Both indicators\n strip\n";
const EX8_2 : &str =
"- |\n detected\n- >\n \n \n # detected\n- |1\n explicit\n- >\n \t\n detected\n";
// `EX8_3a`, `EX8_3b` and `EX8_3c` are three separate inputs sharing one example number.
const EX8_3a : &str =
"- |\n \n text";
const EX8_3b : &str =
"- >\n text\n text";
const EX8_3c : &str =
"- |2\n text";
// One literal scalar per chomping indicator: `-` (strip), none (clip), `+` (keep).
const EX8_4 : &str =
"strip: |-\n text\nclip: |\n text\nkeep: |+\n text\n";
const EX8_5 : &str =
" # Strip\n # Comments:\nstrip: |-\n # text\n \n # Clip\n # comments:\n\nclip: |\n # text\n \n # Keep\n # comments:\n\nkeep: |+\n # text\n\n # Trail\n # Comments\n";
const EX8_6 : &str =
"strip: >-\n\nclip: >\n\nkeep: |+\n\n";
const EX8_7 : &str =
"|\n literal\n \ttext\n\n";
const EX8_8 : &str =
"|\n \n \n literal\n \n \n text\n\n # Comment\n";
const EX8_9 : &str =
">\n folded\n text\n\n";
const EX8_10 : &str =
">\n\n folded\n line\n\n next\n line\n * bullet\n\n * list\n * lines\n\n last\n line\n\n# Comment\n";
// The next three constants reuse the `EX8_10` input unchanged.
const EX8_11 : &str = EX8_10;
const EX8_12 : &str = EX8_10;
const EX8_13 : &str = EX8_10;
const EX8_14 : &str =
"block sequence:\n - one\n - two : three\n";
const EX8_15 : &str =
"- # Empty\n- |\n block node\n- - one # Compact\n - two # sequence\n- one: two # Compact mapping\n";
const EX8_16 : &str =
"block mapping:\n key: value\n";
const EX8_17 : &str =
"? explicit key # Empty value\n? |\n block key\n: - one # Explicit compact\n - two # block value\n";
// XXX libyaml failed this test
const EX8_18 : &str =
"plain key: in-line value\n: # Both empty\n\"quoted key\":\n- entry\n";
const EX8_19 : &str =
"- sun: yellow\n- ? earth: blue\n : moon: white\n";
const EX8_20 : &str =
"-\n \"flow in block\"\n- >\n Block scalar\n- !!map # Block collection\n foo : bar\n";
const EX8_21 : &str =
"literal: |2\n value\nfolded:\n !foo\n >1\n value\n";
const EX8_22 : &str =
"sequence: !!seq\n- entry\n- !!seq\n - nested\nmapping: !!map\n foo: bar\n";

View file

@ -0,0 +1,114 @@
use saphyr::{Yaml, YamlEmitter};
/// Emits `original` as YAML, parses the emitted text back and checks that exactly one
/// document, equal to `original`, comes out.
///
/// # Panics
///
/// Panics if emitting or parsing fails, if the emitted text does not contain exactly one
/// document, or if that document differs from `original`.
fn roundtrip(original: &Yaml) {
    let mut emitted = String::new();
    YamlEmitter::new(&mut emitted).dump(original).unwrap();
    // Print before parsing: captured output is only shown for failing tests, and a parse
    // failure is precisely the case where the emitted text is needed to diagnose the problem.
    println!("emitted {emitted}");

    let documents = Yaml::load_from_str(&emitted).unwrap();
    assert_eq!(documents.len(), 1);
    assert_eq!(documents[0], *original);
}
/// Emits `original` with the emitter's `multiline_strings` option enabled, parses the
/// emitted text back and checks that exactly one document, equal to `original`, comes out.
///
/// # Panics
///
/// Panics if emitting or parsing fails, if the emitted text does not contain exactly one
/// document, or if that document differs from `original`.
fn roundtrip_multiline(original: &Yaml) {
    let mut emitted = String::new();
    let mut emitter = YamlEmitter::new(&mut emitted);
    emitter.multiline_strings(true);
    emitter.dump(original).unwrap();
    // Print before parsing: captured output is only shown for failing tests, and a parse
    // failure is precisely the case where the emitted text is needed to diagnose the problem.
    println!("emitted {emitted}");

    let documents = Yaml::load_from_str(&emitted).unwrap();
    assert_eq!(documents.len(), 1);
    assert_eq!(documents[0], *original);
}
/// Parses `original`, emits the first parsed document, parses that output again and checks
/// that both parse results are equal.
///
/// # Panics
///
/// Panics if either parse or the emission fails, if `original` yields no document, or if
/// the two parse results differ.
fn double_roundtrip(original: &str) {
    let first_pass = Yaml::load_from_str(original).unwrap();

    let mut buffer = String::new();
    let mut emitter = YamlEmitter::new(&mut buffer);
    emitter.dump(&first_pass[0]).unwrap();

    let second_pass = Yaml::load_from_str(&buffer).unwrap();
    assert_eq!(first_pass, second_pass);
}
/// A string holding only the ESC control character (`\x1b`) must round-trip unchanged.
#[test]
fn test_escape_character() {
    let escape = Yaml::String(String::from("\x1b"));
    roundtrip(&escape);
}
/// A string containing `: ` followed by `%` must round-trip unchanged.
#[test]
fn test_colon_in_string() {
    let scalar = Yaml::String(String::from("x: %"));
    roundtrip(&scalar);
}
/// Quoted scalars that look like numbers must round-trip, both when the snippet itself is
/// used as a string value and when it is parsed as a YAML document.
#[test]
fn test_numberlike_strings() {
    let samples = [
        r#"x: "1234""#,
        r#"x: "01234""#,
        r#""1234""#,
        r#""01234""#,
        r#"" 01234""#,
        r#""0x1234""#,
        r#"" 0x1234""#,
    ];

    for &sample in &samples {
        // First treat the raw snippet as the content of a string scalar...
        let as_scalar = Yaml::String(sample.to_string());
        roundtrip(&as_scalar);
        // ...then as a YAML document of its own.
        double_roundtrip(sample);
    }
}
/// Regression test for <https://github.com/chyh1990/yaml-rust/issues/133>: a quoted
/// hexadecimal-looking scalar must still be a string after being emitted and parsed again.
#[test]
fn test_issue133() {
    let mut parsed = Yaml::load_from_str("\"0x123\"").unwrap();
    let doc = parsed.pop().unwrap();
    assert_eq!(doc, Yaml::String("0x123".to_string()));

    let mut out_str = String::new();
    let mut emitter = YamlEmitter::new(&mut out_str);
    emitter.dump(&doc).unwrap();

    let mut reparsed = Yaml::load_from_str(&out_str).unwrap();
    let doc2 = reparsed.pop().unwrap();
    // Used to fail: the re-read value came back as a number instead of a string.
    assert_eq!(doc, doc2);
}
/// An array holding a lone line-feed string must round-trip unchanged.
#[test]
fn test_newline() {
    let element = Yaml::String(String::from("\n"));
    roundtrip(&Yaml::Array(vec![element]));
}
/// An array holding a lone CR+LF string must round-trip unchanged.
#[test]
fn test_crlf() {
    let element = Yaml::String(String::from("\r\n"));
    roundtrip(&Yaml::Array(vec![element]));
}
/// With multiline strings enabled, a string without any line break must round-trip.
#[test]
fn test_multiline_noline() {
    let element = Yaml::String(String::from("a"));
    roundtrip_multiline(&Yaml::Array(vec![element]));
}
/// With multiline strings enabled, a string with a line break in the middle must round-trip.
#[test]
fn test_multiline_inner_newline() {
    let element = Yaml::String(String::from("a\nb"));
    roundtrip_multiline(&Yaml::Array(vec![element]));
}
/// With multiline strings enabled, a string ending in a line break must round-trip.
#[test]
fn test_multiline_trailing_newline() {
    let element = Yaml::String(String::from("a\n"));
    roundtrip_multiline(&Yaml::Array(vec![element]));
}
/// With multiline strings enabled, a string starting with a line break must round-trip.
#[test]
fn test_multiline_leading_newline() {
    let element = Yaml::String(String::from("\na"));
    roundtrip_multiline(&Yaml::Array(vec![element]));
}