From d4b97d16a4c6597a51dc288229062d85eeccec44 Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 10:51:35 -0800 Subject: [PATCH 01/15] Update to encoding_rs, rustler 0.37, add dirty schedulers Major update that modernizes the codebase while maintaining backwards compatibility: - Switch from `encoding` crate to `encoding_rs` (Firefox's encoding library) - Update rustler 0.29.1 -> 0.37 (fixes OTP-26+ compilation, closes #40) - Update rustler_precompiled 0.5 -> 0.8 - Update Rust edition 2018 -> 2021 - Remove unused deps (lazy_static, rustler_codegen) - Rename .cargo/config to .cargo/config.toml (cargo deprecation warning) New features: - Dirty CPU schedulers for binaries >64KB (non-blocking for large data) - safe_encode/2, safe_decode/2 returning {:ok, _} | {:error, _} tuples - encoding_exists?/1 to check if encoding is supported - canonical_name/1 to get WHATWG canonical name for aliases - list_encodings/0 to list all supported encodings Bug fixes: - No more panics in Rust code - proper error handling throughout (closes #24) - All .unwrap() calls replaced with proper error propagation Backwards compatible: - encode/2 and decode/2 maintain same behavior as v0.1.x Co-Authored-By: Claude Opus 4.5 --- CHANGELOG.md | 35 +++ lib/excoding.ex | 279 ++++++++++++++---- lib/excoding/native.ex | 30 ++ mix.exs | 25 +- mix.lock | 20 +- .../excoding/.cargo/{config => config.toml} | 0 native/excoding/Cargo.lock | 157 +++------- native/excoding/Cargo.toml | 10 +- native/excoding/src/lib.rs | 206 +++++++++++-- test/excoding_test.exs | 154 +++++++++- 10 files changed, 681 insertions(+), 235 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 lib/excoding/native.ex rename native/excoding/.cargo/{config => config.toml} (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d79814b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,35 @@ +# Changelog + +## v0.2.0 (2026-01-22) + +### New Features + 
+- **Switched to `encoding_rs`**: Now uses the same encoding library as Firefox for better performance and active maintenance +- **Dirty schedulers**: Operations on binaries larger than 64KB automatically use dirty CPU schedulers to avoid blocking the BEAM +- **New functions**: + - `safe_encode/2` - Encode with `{:ok, result}` / `{:error, reason}` return + - `safe_decode/2` - Decode with `{:ok, result}` / `{:error, reason}` return + - `encoding_exists?/1` - Check if an encoding is supported + - `canonical_name/1` - Get the canonical WHATWG name for an encoding alias + - `list_encodings/0` - List all supported encodings + - `dirty_threshold/0` - Get the threshold for dirty scheduler usage + +### Improvements + +- Updated to Rust 2021 edition +- Updated to rustler 0.37 +- Updated to rustler_precompiled 0.8 +- Removed unused dependencies (lazy_static, rustler_codegen) +- Improved documentation with examples +- Better error handling - no more panics in Rust code +- Fixes OTP-26+ compilation issues (#40) +- Fixes panic on error (#24) + +### Backwards Compatibility + +- `encode/2` and `decode/2` maintain the same behavior as v0.1.x +- Use `safe_encode/2` and `safe_decode/2` for tuple-based error handling + +## v0.1.5 and earlier + +See [GitHub releases](https://github.com/elixir-ecto/excoding/releases) for previous versions. diff --git a/lib/excoding.ex b/lib/excoding.ex index 96f06cc..d6e5447 100644 --- a/lib/excoding.ex +++ b/lib/excoding.ex @@ -1,64 +1,243 @@ defmodule Excoding do @moduledoc """ - String encoding/decoding library with NIF binding to rust [encoding](https://crates.io/crates/encoding) crate. 
- - Supported encodings: - - * 7-bit strict ASCII (`ascii`) - * UTF-8 (`utf-8`) - * UTF-16 in little endian (`utf-16` or `utf-16le`) and big endian (`utf-16be`) - * All single byte encoding in WHATWG Encoding Standard: - * IBM code page `ibm-866` - * ISO 8859-{2,3,4,5,6,7,8,10,13,14,15,16} - * KOI8-R, KOI8-U - * MacRoman (`macintosh`), Macintosh Cyrillic encoding (`x-mac-cyrillic`) - * Windows code pages `windows-874`, `windows-1250`, `windows-1251`, `windows-1252` - (instead of ISO 8859-1), `windows-1253`, `windows-1254` (instead of ISO 8859-9), - `windows-1255`, `windows-1256`, `windows-1257`, `windows-1258` - * All multi byte encodings in WHATWG Encoding Standard: - * Windows code page `windows-949` (`euc-kr`, since the strict EUC-KR is hardly used) - * EUC-JP (`euc-jp`) and Windows code page `windows-932` (`shift_jis`, since it's the most widespread extension to Shift_JIS) - * ISO-2022-JP (`iso-2022-jp`) with asymmetric JIS X 0212 support (Note: this is not yet up to date to the current standard) - * GBK - * GB 18030 - * Big5-2003 with HKSCS-2008 extensions - * Encodings that were originally specified by WHATWG Encoding Standard: - * HZ - * ISO 8859-1 (`iso-8859-1` distinct from Windows code page `windows-1255`) - - - ### Example + High-performance string encoding/decoding using Rust's `encoding_rs` crate. + + This library provides fast character encoding conversion using the same + encoding library that powers Firefox. It supports all encodings in the + WHATWG Encoding Standard. 
+ + ## Features + + - **High performance**: Uses `encoding_rs`, the same library used by Firefox + - **Dirty schedulers**: Large binaries (>64KB) automatically use dirty CPU + schedulers to avoid blocking the BEAM + - **WHATWG compliant**: Supports all encodings from the WHATWG Encoding Standard + + ## Supported Encodings + + * UTF-8, UTF-16LE, UTF-16BE + * Windows code pages: 874, 1250-1258, 949, 932 + * ISO-8859 family: 2, 3, 4, 5, 6, 7, 8, 8-I, 10, 13, 14, 15, 16 + * IBM866 + * KOI8-R, KOI8-U + * macintosh, x-mac-cyrillic + * Asian encodings: Shift_JIS, EUC-JP, ISO-2022-JP, EUC-KR, GBK, GB18030, Big5 + * x-user-defined + + ## Examples iex> Excoding.encode("¥₪ש", "windows-1255") - <<0xA5, 0xA4, 0xF9>> - iex> Excoding.decode(<<0xA5, 0xA4, 0xF9>>, "windows-1255") + <<165, 164, 249>> + + iex> Excoding.decode(<<165, 164, 249>>, "windows-1255") "¥₪ש" + + iex> Excoding.safe_encode("Hello", "windows-1252") + {:ok, "Hello"} + + iex> Excoding.safe_decode(<<72, 101, 108, 108, 111>>, "windows-1252") + {:ok, "Hello"} + + iex> Excoding.encoding_exists?("utf-8") + true + + iex> Excoding.encoding_exists?("not-an-encoding") + false + """ + + alias Excoding.Native + + @doc """ + Encodes a UTF-8 string to the specified encoding. + + Returns the encoded binary on success, or raises `ArgumentError` if the + encoding is not recognized. Unmappable characters are replaced with + HTML decimal numeric character references (e.g. `&#8364;`). + + Automatically uses dirty CPU schedulers for strings larger than 64KB. 
+ + ## Examples + + iex> Excoding.encode("Hello", "windows-1252") + "Hello" + + iex> Excoding.encode("ћирилица", "windows-1251") + <<158, 232, 240, 232, 235, 232, 246, 224>> + + iex> Excoding.encode("Hello", "invalid-encoding") + ** (ArgumentError) unknown encoding: invalid-encoding + """ + @spec encode(String.t(), String.t()) :: binary() + def encode(string, encoding) when is_binary(string) and is_binary(encoding) do + case safe_encode(string, encoding) do + {:ok, binary} -> binary + {:error, :unknown_encoding} -> raise ArgumentError, "unknown encoding: #{encoding}" + end + end + + @doc """ + Encodes a UTF-8 string to the specified encoding, returning a tuple. + + Returns `{:ok, binary}` on success, or `{:error, :unknown_encoding}` if the encoding label is not recognized. + Unmappable characters are replaced with HTML decimal numeric character references (e.g. `&#8364;`). + + Automatically uses dirty CPU schedulers for strings larger than 64KB. + + ## Examples + + iex> Excoding.safe_encode("Hello", "windows-1252") + {:ok, "Hello"} + + iex> Excoding.safe_encode("Hello", "invalid-encoding") + {:error, :unknown_encoding} + """ + @spec safe_encode(String.t(), String.t()) :: {:ok, binary()} | {:error, atom()} + def safe_encode(string, encoding) when is_binary(string) and is_binary(encoding) do + if byte_size(string) > Native.dirty_threshold() do + case Native.encode_dirty(string, encoding) do + {:ok, binary} -> {:ok, binary} + {:error, _} -> {:error, :unknown_encoding} + end + else + case Native.encode_normal(string, encoding) do + {:ok, binary} -> {:ok, binary} + {:error, _} -> {:error, :unknown_encoding} + end + end + end + + @doc """ + Decodes a binary from the specified encoding to a UTF-8 string. + + Returns the decoded string on success, or raises `ArgumentError` if the + encoding is not recognized. Unmappable bytes are replaced with the Unicode + replacement character (U+FFFD). + + Automatically uses dirty CPU schedulers for binaries larger than 64KB. 
+ + ## Examples + + iex> Excoding.decode(<<72, 101, 108, 108, 111>>, "windows-1252") + "Hello" + + iex> Excoding.decode(<<158, 232, 240, 232, 235, 232, 246, 224>>, "windows-1251") + "ћирилица" + + iex> Excoding.decode(<<0xFF>>, "invalid-encoding") + ** (ArgumentError) unknown encoding: invalid-encoding """ - version = Mix.Project.config()[:version] - - use RustlerPrecompiled, - otp_app: :excoding, - crate: "excoding", - base_url: "https://github.com/elixir-ecto/excoding/releases/download/v#{version}", - force_build: System.get_env("RUSTLER_PRECOMPILATION_EXCODING_BUILD") in ["1", "true"], - mode: if(Mix.env() == :prod, do: :release, else: :debug), - targets: - Enum.uniq(["aarch64-unknown-linux-musl" | RustlerPrecompiled.Config.default_targets()]), - version: version + @spec decode(binary(), String.t()) :: String.t() + def decode(binary, encoding) when is_binary(binary) and is_binary(encoding) do + case safe_decode(binary, encoding) do + {:ok, string} -> string + {:error, :unknown_encoding} -> raise ArgumentError, "unknown encoding: #{encoding}" + end + end @doc """ - Encodes utf-8 string using given codepage. If there are any unknown codes they - will be converted into `?` in its place. + Decodes a binary from the specified encoding to a UTF-8 string, returning a tuple. + + Returns `{:ok, string}` on success, or `{:error, reason}` on failure. + Unmappable bytes are replaced with the Unicode replacement character (U+FFFD). + + Automatically uses dirty CPU schedulers for binaries larger than 64KB. 
+ + ## Examples + + iex> Excoding.safe_decode(<<72, 101, 108, 108, 111>>, "windows-1252") + {:ok, "Hello"} + + iex> Excoding.safe_decode(<<0xFF>>, "invalid-encoding") + {:error, :unknown_encoding} """ - @spec encode(string :: binary, codepage :: binary) :: binary - def encode(_string, _codepage), do: error() + @spec safe_decode(binary(), String.t()) :: {:ok, String.t()} | {:error, atom()} + def safe_decode(binary, encoding) when is_binary(binary) and is_binary(encoding) do + if byte_size(binary) > Native.dirty_threshold() do + case Native.decode_dirty(binary, encoding) do + {:ok, string} -> {:ok, string} + {:error, _} -> {:error, :unknown_encoding} + end + else + case Native.decode_normal(binary, encoding) do + {:ok, string} -> {:ok, string} + {:error, _} -> {:error, :unknown_encoding} + end + end + end @doc """ - Decodes given binary from given codepage into utf-8 string. + Checks if an encoding label is valid and supported. + + ## Examples + + iex> Excoding.encoding_exists?("utf-8") + true + + iex> Excoding.encoding_exists?("UTF-8") + true + + iex> Excoding.encoding_exists?("not-an-encoding") + false """ - @spec decode(binary :: binary, codepage :: binary) :: binary - def decode(_binary, _codepage), do: error() + @spec encoding_exists?(String.t()) :: boolean() + def encoding_exists?(encoding) when is_binary(encoding) do + Native.encoding_exists(encoding) + end + + @doc """ + Returns the canonical name for an encoding label. + + Encoding labels have many aliases (e.g., "latin1", "iso-8859-1", "iso_8859-1"). + This function returns the canonical WHATWG name for any valid alias. 
- @doc false - def error(), do: :erlang.nif_error(:nif_not_loaded) + ## Examples + + iex> Excoding.canonical_name("latin1") + {:ok, "windows-1252"} + + iex> Excoding.canonical_name("utf8") + {:ok, "UTF-8"} + + iex> Excoding.canonical_name("invalid") + {:error, :unknown_encoding} + """ + @spec canonical_name(String.t()) :: {:ok, String.t()} | {:error, atom()} + def canonical_name(encoding) when is_binary(encoding) do + case Native.canonical_name(encoding) do + {:ok, name} -> {:ok, name} + {:error, _} -> {:error, :unknown_encoding} + end + end + + @doc """ + Returns a list of all supported encoding names. + + ## Examples + + iex> "UTF-8" in Excoding.list_encodings() + true + + iex> "Shift_JIS" in Excoding.list_encodings() + true + """ + @spec list_encodings() :: [String.t()] + def list_encodings do + Native.list_encodings() + end + + @doc """ + Returns the threshold (in bytes) above which dirty schedulers are used. + + Encode/decode operations on binaries larger than this threshold will + automatically use dirty CPU schedulers to avoid blocking the BEAM. + + ## Examples + + iex> Excoding.dirty_threshold() + 65536 + """ + @spec dirty_threshold() :: non_neg_integer() + def dirty_threshold do + Native.dirty_threshold() + end end diff --git a/lib/excoding/native.ex b/lib/excoding/native.ex new file mode 100644 index 0000000..c5265e7 --- /dev/null +++ b/lib/excoding/native.ex @@ -0,0 +1,30 @@ +defmodule Excoding.Native do + @moduledoc false + # Low-level NIF bindings. Use Excoding module for the public API. 
+ + version = Mix.Project.config()[:version] + + use RustlerPrecompiled, + otp_app: :excoding, + crate: "excoding", + base_url: "https://github.com/elixir-ecto/excoding/releases/download/v#{version}", + force_build: System.get_env("RUSTLER_PRECOMPILATION_EXCODING_BUILD") in ["1", "true"], + mode: if(Mix.env() == :prod, do: :release, else: :debug), + targets: + Enum.uniq(["aarch64-unknown-linux-musl" | RustlerPrecompiled.Config.default_targets()]), + version: version + + # Decode functions (normal and dirty scheduler versions) + def decode_normal(_binary, _encoding), do: :erlang.nif_error(:nif_not_loaded) + def decode_dirty(_binary, _encoding), do: :erlang.nif_error(:nif_not_loaded) + + # Encode functions (normal and dirty scheduler versions) + def encode_normal(_string, _encoding), do: :erlang.nif_error(:nif_not_loaded) + def encode_dirty(_string, _encoding), do: :erlang.nif_error(:nif_not_loaded) + + # Utility functions + def dirty_threshold, do: :erlang.nif_error(:nif_not_loaded) + def encoding_exists(_encoding), do: :erlang.nif_error(:nif_not_loaded) + def canonical_name(_encoding), do: :erlang.nif_error(:nif_not_loaded) + def list_encodings, do: :erlang.nif_error(:nif_not_loaded) +end diff --git a/mix.exs b/mix.exs index eb8aaf8..fa62af9 100644 --- a/mix.exs +++ b/mix.exs @@ -1,34 +1,33 @@ defmodule Excoding.MixProject do use Mix.Project - @version "0.1.5" + @version "0.2.0" def project do [ app: :excoding, version: @version, - elixir: "~> 1.9", + elixir: "~> 1.12", start_permanent: Mix.env() == :prod, deps: deps(), description: - "String encoding/decoding NIF using rust [encoding](https://crates.io/crates/encoding) library", - package: package() + "High-performance string encoding/decoding NIF using Rust's encoding_rs (Firefox's encoding library)", + package: package(), + docs: docs() ] end - # Run "mix help compile.app" to learn about applications. def application do [ extra_applications: [:logger] ] end - # Run "mix help deps" to learn about dependencies. 
defp deps do [ - {:rustler_precompiled, "~> 0.5"}, - {:rustler, ">= 0.0.0", optional: true}, - {:ex_doc, ">= 0.0.0", only: :dev, runtime: false} + {:rustler_precompiled, "~> 0.8"}, + {:rustler, "~> 0.37", optional: true}, + {:ex_doc, "~> 0.31", only: :dev, runtime: false} ] end @@ -46,7 +45,15 @@ defmodule Excoding.MixProject do "checksum-*.exs", "mix.exs", "README.md", + "CHANGELOG.md" ] ] end + + defp docs do + [ + main: "Excoding", + extras: ["README.md", "CHANGELOG.md"] + ] + end end diff --git a/mix.lock b/mix.lock index 215f82e..8394429 100644 --- a/mix.lock +++ b/mix.lock @@ -1,13 +1,13 @@ %{ - "castore": {:hex, :castore, "1.0.1", "240b9edb4e9e94f8f56ab39d8d2d0a57f49e46c56aced8f873892df8ff64ff5a", [:mix], [], "hexpm", "b4951de93c224d44fac71614beabd88b71932d0b1dea80d2f80fb9044e01bbb3"}, - "earmark_parser": {:hex, :earmark_parser, "1.4.32", "fa739a0ecfa34493de19426681b23f6814573faee95dfd4b4aafe15a7b5b32c6", [:mix], [], "hexpm", "b8b0dd77d60373e77a3d7e8afa598f325e49e8663a51bcc2b88ef41838cca755"}, - "ex_doc": {:hex, :ex_doc, "0.29.4", "6257ecbb20c7396b1fe5accd55b7b0d23f44b6aa18017b415cb4c2b91d997729", [:mix], [{:earmark_parser, "~> 1.4.31", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "2c6699a737ae46cb61e4ed012af931b57b699643b24dabe2400a8168414bc4f5"}, - "jason": {:hex, :jason, "1.4.0", "e855647bc964a44e2f67df589ccf49105ae039d4179db7f6271dfd3843dc27e6", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "79a3791085b2a0f743ca04cec0f7be26443738779d09302e01318f97bdb82121"}, - "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", 
"0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"}, - "makeup_elixir": {:hex, :makeup_elixir, "0.16.1", "cc9e3ca312f1cfeccc572b37a09980287e243648108384b97ff2b76e505c3555", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "e127a341ad1b209bd80f7bd1620a15693a9908ed780c3b763bccf7d200c767c6"}, - "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, - "nimble_parsec": {:hex, :nimble_parsec, "1.3.1", "2c54013ecf170e249e9291ed0a62e5832f70a476c61da16f6aac6dca0189f2af", [:mix], [], "hexpm", "2682e3c0b2eb58d90c6375fc0cc30bc7be06f365bf72608804fb9cffa5e1b167"}, - "rustler": {:hex, :rustler, "0.29.1", "880f20ae3027bd7945def6cea767f5257bc926f33ff50c0d5d5a5315883c084d", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "109497d701861bfcd26eb8f5801fe327a8eef304f56a5b63ef61151ff44ac9b6"}, - "rustler_precompiled": {:hex, :rustler_precompiled, "0.6.1", "160b545bce8bf9a3f1b436b2c10f53574036a0db628e40f393328cbbe593602f", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "0dd269fa261c4e3df290b12031c575fff07a542749f7b0e8b744d72d66c43600"}, + "castore": {:hex, :castore, "1.0.17", "4f9770d2d45fbd91dcf6bd404cf64e7e58fed04fadda0923dc32acca0badffa2", [:mix], [], "hexpm", "12d24b9d80b910dd3953e165636d68f147a31db945d2dcb9365e441f8b5351e5"}, + "earmark_parser": {:hex, :earmark_parser, "1.4.44", "f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", 
"4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"}, + "ex_doc": {:hex, :ex_doc, "0.40.0", "2635974389b80fd3ca61b0f993d459dad05b4a8f9b069dcfbbc5f6a8a6aef60e", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "c040735250e2752b6e1102eeb4aa3f1dca74c316db873ae09f955d42136e7e5b"}, + "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, + "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, + "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, + "makeup_erlang": {:hex, :makeup_erlang, "1.0.3", "4252d5d4098da7415c390e847c814bad3764c94a814a0b4245176215615e1035", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "953297c02582a33411ac6208f2c6e55f0e870df7f80da724ed613f10e6706afd"}, + "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", 
"8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, + "rustler": {:hex, :rustler, "0.37.1", "721434020c7f6f8e1cdc57f44f75c490435b01de96384f8ccb96043f12e8a7e0", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "24547e9b8640cf00e6a2071acb710f3e12ce0346692e45098d84d45cdb54fd79"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.4", "700a878312acfac79fb6c572bb8b57f5aae05fe1cf70d34b5974850bbf2c05bf", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "3b33d99b540b15f142ba47944f7a163a25069f6d608783c321029bc1ffb09514"}, "toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"}, } diff --git a/native/excoding/.cargo/config b/native/excoding/.cargo/config.toml similarity index 100% rename from native/excoding/.cargo/config rename to native/excoding/.cargo/config.toml diff --git a/native/excoding/Cargo.lock b/native/excoding/Cargo.lock index 170f342..1dd2f17 100644 --- a/native/excoding/Cargo.lock +++ b/native/excoding/Cargo.lock @@ -1,108 +1,55 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] -name = "aho-corasick" -version = "1.0.2" +name = "cfg-if" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" -dependencies = [ - "memchr", -] - -[[package]] -name = "encoding" -version = "0.2.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" -dependencies = [ - "encoding-index-japanese", - "encoding-index-korean", - "encoding-index-simpchinese", - "encoding-index-singlebyte", - "encoding-index-tradchinese", -] - -[[package]] -name = "encoding-index-japanese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" -dependencies = [ - "encoding_index_tests", -] +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] -name = "encoding-index-korean" -version = "1.20141219.5" +name = "encoding_rs" +version = "0.8.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ - "encoding_index_tests", + "cfg-if", ] [[package]] -name = "encoding-index-simpchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +name = "excoding" +version = "0.2.0" dependencies = [ - "encoding_index_tests", + "encoding_rs", + "rustler", ] [[package]] -name = "encoding-index-singlebyte" -version = "1.20141219.5" +name = "heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" 
-dependencies = [ - "encoding_index_tests", -] +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] -name = "encoding-index-tradchinese" -version = "1.20141219.5" +name = "inventory" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +checksum = "bc61209c082fbeb19919bee74b176221b27223e27b65d781eb91af24eb1fb46e" dependencies = [ - "encoding_index_tests", + "rustversion", ] [[package]] -name = "encoding_index_tests" -version = "0.1.4" +name = "libloading" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" - -[[package]] -name = "excoding" -version = "0.1.0" +checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" dependencies = [ - "encoding", - "lazy_static", - "rustler", - "rustler_codegen", + "cfg-if", + "windows-link", ] -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "memchr" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" - [[package]] name = "proc-macro2" version = "1.0.63" @@ -122,54 +69,41 @@ dependencies = [ ] [[package]] -name = "regex" -version = "1.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" 
-version = "0.7.2" +name = "regex-lite" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" +checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" [[package]] name = "rustler" -version = "0.29.1" +version = "0.37.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0884cb623b9f43d3e2c51f9071c5e96a5acf3e6e6007866812884ff0cb983f1e" +checksum = "a5c708d8b686a8d426681908369f835af90349f7ebb92ab87ddf14a851efd556" dependencies = [ - "lazy_static", + "inventory", + "libloading", + "regex-lite", "rustler_codegen", - "rustler_sys", ] [[package]] name = "rustler_codegen" -version = "0.29.1" +version = "0.37.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50e277af754f2560cf4c4ebedb68c1a735292fb354505c6133e47ec406e699cf" +checksum = "da3f478ec72581782a7dd62a5adb406aa076af7cedd7de63fa3676c927eb216a" dependencies = [ "heck", + "inventory", "proc-macro2", "quote", "syn", ] [[package]] -name = "rustler_sys" -version = "2.3.0" +name = "rustversion" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b311902a5c224452d0d8e1821a4b682d43f58248a2c559f613a7b52e0980075" -dependencies = [ - "regex", - "unreachable", -] +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "syn" @@ -189,16 +123,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" [[package]] -name = "unreachable" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" -dependencies = [ - "void", -] - -[[package]] -name = "void" -version = "1.0.2" +name = "windows-link" +version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" diff --git a/native/excoding/Cargo.toml b/native/excoding/Cargo.toml index 2315d88..e6e8ee2 100644 --- a/native/excoding/Cargo.toml +++ b/native/excoding/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "excoding" -version = "0.1.0" +version = "0.2.0" authors = [] -edition = "2018" +edition = "2021" [lib] name = "excoding" @@ -10,7 +10,5 @@ path = "src/lib.rs" crate-type = ["cdylib"] [dependencies] -rustler = "0.29.1" -rustler_codegen = "0.29.1" -lazy_static = "1.4" -encoding = "0.2.33" +rustler = "0.37" +encoding_rs = "0.8" diff --git a/native/excoding/src/lib.rs b/native/excoding/src/lib.rs index 4655b6c..7ddea93 100644 --- a/native/excoding/src/lib.rs +++ b/native/excoding/src/lib.rs @@ -1,53 +1,197 @@ -#[macro_use] -extern crate rustler; -extern crate rustler_codegen; -extern crate encoding; -extern crate lazy_static; - -use encoding::label::encoding_from_whatwg_label; -use encoding::{DecoderTrap, EncoderTrap}; - -use rustler::types::binary::{Binary, OwnedBinary}; -use rustler::Error; -use rustler::{Encoder, Env, NifResult, Term}; +//! String encoding/decoding NIF for Elixir using Rust's encoding_rs crate. +//! +//! This is a high-performance implementation based on encoding_rs, the same +//! encoding library used by Firefox. It provides fast character encoding +//! conversion for all encodings in the WHATWG Encoding Standard. +//! +//! Supported encodings include: +//! - UTF-8, UTF-16LE, UTF-16BE +//! - Windows code pages (1250-1258, 874, 949, 932) +//! - ISO-8859 family (1-16) +//! - Asian encodings (Shift_JIS, EUC-JP, EUC-KR, GBK, GB18030, Big5) +//! - And many more from the WHATWG Encoding Standard +use encoding_rs::Encoding; +use rustler::{Atom, Binary, Env, NifResult, OwnedBinary}; use std::io::Write; mod atoms { - atoms! { + rustler::atoms! 
{ ok, error, unknown_encoding, - //atom __true__ = "true"; - //atom __false__ = "false"; + encode_error, + decode_error, } } -rustler::init!("Elixir.Excoding", [encode, decode]); +/// Threshold for using dirty schedulers (64KB) +const DIRTY_THRESHOLD: usize = 64 * 1024; + +/// Decodes a binary from the specified encoding to a UTF-8 string. +/// +/// Always runs on a dirty CPU scheduler; the Elixir wrapper calls it for +/// binaries larger than `DIRTY_THRESHOLD` (`decode_normal` otherwise). +/// +/// ## Arguments +/// * `in_binary` - The binary data to decode +/// * `enc` - The source encoding label (WHATWG format, e.g., "windows-1252") +/// +/// ## Returns +/// * `{:ok, string}` on success +/// * `{:error, "unknown_encoding"}` (reason is a string) if the encoding label is not recognized +#[rustler::nif(schedule = "DirtyCpu")] +fn decode_dirty<'a>(env: Env<'a>, in_binary: Binary, enc: &str) -> NifResult<(Atom, String)> { + decode_impl(env, in_binary, enc) +} #[rustler::nif] -fn decode<'a>(env: Env<'a>, in_binary: Binary, enc: String) -> NifResult<Term<'a>> { - match encoding_from_whatwg_label(&enc) { +fn decode_normal<'a>(env: Env<'a>, in_binary: Binary, enc: &str) -> NifResult<(Atom, String)> { + decode_impl(env, in_binary, enc) +} + +fn decode_impl<'a>(_env: Env<'a>, in_binary: Binary, enc: &str) -> NifResult<(Atom, String)> { + match Encoding::for_label(enc.as_bytes()) { Some(encoding) => { - let in_str = in_binary.to_owned().unwrap(); - let res = encoding - .decode(in_str.as_slice(), DecoderTrap::Replace) - .unwrap(); - Ok(res.encode(env)) + let (decoded, _, _had_errors) = encoding.decode(in_binary.as_slice()); + // encoding_rs replaces malformed sequences with U+FFFD automatically + Ok((atoms::ok(), decoded.into_owned())) } - None => Err(Error::BadArg) + None => Ok((atoms::error(), "unknown_encoding".to_string())), } } +/// Encodes a UTF-8 string to the specified encoding. +/// +/// Always runs on a dirty CPU scheduler; the Elixir wrapper calls it for +/// strings larger than `DIRTY_THRESHOLD` (`encode_normal` otherwise). 
+/// +/// ## Arguments +/// * `env` - The Erlang environment +/// * `in_str` - The UTF-8 string to encode +/// * `enc` - The target encoding label (WHATWG format, e.g., "shift_jis") +/// +/// ## Returns +/// * `{:ok, binary}` on success +/// * `{:error, :unknown_encoding}` if encoding label is not recognized +#[rustler::nif(schedule = "DirtyCpu")] +fn encode_dirty<'a>(env: Env<'a>, in_str: &str, enc: &str) -> NifResult<(Atom, Binary<'a>)> { + encode_impl(env, in_str, enc) +} + #[rustler::nif] -fn encode<'a>(env: Env<'a>, in_str: &str, enc: String) -> NifResult<Term<'a>> { - match encoding_from_whatwg_label(&enc) { +fn encode_normal<'a>(env: Env<'a>, in_str: &str, enc: &str) -> NifResult<(Atom, Binary<'a>)> { + encode_impl(env, in_str, enc) +} + +fn encode_impl<'a>(env: Env<'a>, in_str: &str, enc: &str) -> NifResult<(Atom, Binary<'a>)> { + match Encoding::for_label(enc.as_bytes()) { Some(encoding) => { - let enc_bin = encoding.encode(in_str, EncoderTrap::Replace).unwrap(); - let mut bin = OwnedBinary::new(enc_bin.len()).unwrap(); - bin.as_mut_slice().write_all(&enc_bin).unwrap(); - Ok(bin.release(env).encode(env)) + let (encoded, _, _had_errors) = encoding.encode(in_str); + // encoding_rs replaces unmappable characters automatically + + let mut bin = OwnedBinary::new(encoded.len()) + .ok_or_else(|| rustler::Error::Term(Box::new("allocation_failed")))?; + bin.as_mut_slice() + .write_all(&encoded) + .map_err(|_| rustler::Error::Term(Box::new("write_failed")))?; + + Ok((atoms::ok(), bin.release(env))) + } + None => { + // Return empty binary for error case + let bin = OwnedBinary::new(0) + .ok_or_else(|| rustler::Error::Term(Box::new("allocation_failed")))?; + Ok((atoms::error(), bin.release(env))) } - None => Err(Error::BadArg) } } + +/// Returns the dirty threshold size in bytes. +/// Binaries/strings larger than this will use dirty schedulers. +#[rustler::nif] +fn dirty_threshold() -> usize { + DIRTY_THRESHOLD +} + +/// Checks if an encoding label is valid/supported. 
+/// +/// ## Arguments +/// * `enc` - The encoding label to check +/// +/// ## Returns +/// * `true` if the encoding is supported +/// * `false` otherwise +#[rustler::nif] +fn encoding_exists(enc: &str) -> bool { + Encoding::for_label(enc.as_bytes()).is_some() +} + +/// Returns the canonical name for an encoding label. +/// +/// ## Arguments +/// * `enc` - The encoding label (can be an alias) +/// +/// ## Returns +/// * `{:ok, name}` with the canonical encoding name +/// * `{:error, :unknown_encoding}` if not recognized +#[rustler::nif] +fn canonical_name(enc: &str) -> (Atom, String) { + match Encoding::for_label(enc.as_bytes()) { + Some(encoding) => (atoms::ok(), encoding.name().to_string()), + None => (atoms::error(), "unknown_encoding".to_string()), + } +} + +/// Lists all supported encoding names. +/// +/// ## Returns +/// A list of all canonical encoding names supported by this library. +#[rustler::nif] +fn list_encodings() -> Vec<&'static str> { + // encoding_rs doesn't expose a list, so we provide the WHATWG standard ones + vec![ + "UTF-8", + "IBM866", + "ISO-8859-2", + "ISO-8859-3", + "ISO-8859-4", + "ISO-8859-5", + "ISO-8859-6", + "ISO-8859-7", + "ISO-8859-8", + "ISO-8859-8-I", + "ISO-8859-10", + "ISO-8859-13", + "ISO-8859-14", + "ISO-8859-15", + "ISO-8859-16", + "KOI8-R", + "KOI8-U", + "macintosh", + "windows-874", + "windows-1250", + "windows-1251", + "windows-1252", + "windows-1253", + "windows-1254", + "windows-1255", + "windows-1256", + "windows-1257", + "windows-1258", + "x-mac-cyrillic", + "GBK", + "gb18030", + "Big5", + "EUC-JP", + "ISO-2022-JP", + "Shift_JIS", + "EUC-KR", + "replacement", + "UTF-16BE", + "UTF-16LE", + "x-user-defined", + ] +} + +rustler::init!("Elixir.Excoding.Native"); diff --git a/test/excoding_test.exs b/test/excoding_test.exs index b9d5dbe..8a8ebcf 100644 --- a/test/excoding_test.exs +++ b/test/excoding_test.exs @@ -1,24 +1,152 @@ defmodule ExcodingTest do use ExUnit.Case - import Excoding doctest Excoding - test "should 
decode string correctly" do - assert "caf\u00e9" == decode(<<99, 97, 102, 233>>, "iso-8859-1") + describe "decode/2" do + test "decodes ISO-8859-1" do + assert "café" == Excoding.decode(<<99, 97, 102, 233>>, "iso-8859-1") + end - assert "Hello, 世界!" == - decode(<<72, 101, 108, 108, 111, 44, 32, 225, 166, 205, 163, 33>>, "windows-949") + test "decodes Windows-949 (Korean)" do + assert "Hello, 世界!" == + Excoding.decode( + <<72, 101, 108, 108, 111, 44, 32, 225, 166, 205, 163, 33>>, + "windows-949" + ) + end - assert <<158, 232, 240, 232, 235, 232, 246, 224>> = encode("ћирилица", "windows-1251") - assert "ћирилица" = decode(<<158, 232, 240, 232, 235, 232, 246, 224>>, "windows-1251") + test "decodes Windows-1251 (Cyrillic)" do + assert "ћирилица" == + Excoding.decode(<<158, 232, 240, 232, 235, 232, 246, 224>>, "windows-1251") + end + + test "replaces invalid bytes with replacement character" do + assert "\u{c6b0}\u{c640}\u{fffd}\u{c559}" == + Excoding.decode(<<0xBF, 0xEC, 0xBF, 0xCD, 0xFF, 0xBE, 0xD3>>, "windows-949") + + # Same with alias + assert "\u{c6b0}\u{c640}\u{fffd}\u{c559}" == + Excoding.decode(<<0xBF, 0xEC, 0xBF, 0xCD, 0xFF, 0xBE, 0xD3>>, "euc-kr") + end + + test "raises on unknown encoding" do + assert_raise ArgumentError, ~r/unknown encoding/, fn -> + Excoding.decode(<<1, 2, 3>>, "not-an-encoding") + end + end + end + + describe "safe_decode/2" do + test "returns ok tuple on success" do + assert {:ok, "café"} == Excoding.safe_decode(<<99, 97, 102, 233>>, "iso-8859-1") + end + + test "returns error tuple on unknown encoding" do + assert {:error, :unknown_encoding} == Excoding.safe_decode(<<1, 2, 3>>, "not-an-encoding") + end + end + + describe "encode/2" do + test "encodes to Windows-1251 (Cyrillic)" do + assert <<158, 232, 240, 232, 235, 232, 246, 224>> == + Excoding.encode("ћирилица", "windows-1251") + end + + test "encodes to Windows-1255 (Hebrew)" do + assert <<165, 164, 249>> == Excoding.encode("¥₪ש", "windows-1255") + end + + test "raises on unknown 
encoding" do + assert_raise ArgumentError, ~r/unknown encoding/, fn -> + Excoding.encode("hello", "not-an-encoding") + end + end + end + + describe "safe_encode/2" do + test "returns ok tuple on success" do + assert {:ok, <<158, 232, 240, 232, 235, 232, 246, 224>>} == + Excoding.safe_encode("ћирилица", "windows-1251") + end + + test "returns error tuple on unknown encoding" do + assert {:error, :unknown_encoding} == Excoding.safe_encode("hello", "not-an-encoding") + end end - test "should decode broken string and replace unkinwn with `�` code" do - assert "\u{c6b0}\u{c640}\u{fffd}\u{c559}" == - decode(<<0xBF, 0xEC, 0xBF, 0xCD, 0xFF, 0xBE, 0xD3>>, "windows-949") + describe "encoding_exists?/1" do + test "returns true for valid encodings" do + assert Excoding.encoding_exists?("utf-8") + assert Excoding.encoding_exists?("UTF-8") + assert Excoding.encoding_exists?("windows-1252") + assert Excoding.encoding_exists?("shift_jis") + end + + test "returns false for invalid encodings" do + refute Excoding.encoding_exists?("not-an-encoding") + refute Excoding.encoding_exists?("") + end + end + + describe "canonical_name/1" do + test "returns canonical name for aliases" do + assert {:ok, "UTF-8"} == Excoding.canonical_name("utf-8") + assert {:ok, "UTF-8"} == Excoding.canonical_name("utf8") + assert {:ok, "windows-1252"} == Excoding.canonical_name("latin1") + end + + test "returns error for invalid encodings" do + assert {:error, :unknown_encoding} == Excoding.canonical_name("not-an-encoding") + end + end + + describe "list_encodings/0" do + test "returns list of encodings" do + encodings = Excoding.list_encodings() + assert is_list(encodings) + assert "UTF-8" in encodings + assert "Shift_JIS" in encodings + assert "windows-1252" in encodings + end + end + + describe "dirty_threshold/0" do + test "returns threshold value" do + assert Excoding.dirty_threshold() == 64 * 1024 + end + end + + describe "roundtrip" do + test "encode then decode returns original for UTF-8" do + 
original = "Hello, 世界! 🎉" + encoded = Excoding.encode(original, "utf-8") + decoded = Excoding.decode(encoded, "utf-8") + assert decoded == original + end + + test "encode then decode for single-byte encodings" do + # ASCII subset works for all single-byte encodings + original = "Hello World" + + for encoding <- ["windows-1252", "iso-8859-1", "windows-1251"] do + encoded = Excoding.encode(original, encoding) + decoded = Excoding.decode(encoded, encoding) + assert decoded == original, "Roundtrip failed for #{encoding}" + end + end + + test "encode then decode for Asian encodings" do + # Japanese text roundtrip through Shift_JIS + original = "こんにちは" + encoded = Excoding.encode(original, "shift_jis") + decoded = Excoding.decode(encoded, "shift_jis") + assert decoded == original - # same encoding/decoding as above but ecoding name is different - assert "\u{c6b0}\u{c640}\u{fffd}\u{c559}" == - decode(<<0xBF, 0xEC, 0xBF, 0xCD, 0xFF, 0xBE, 0xD3>>, "euc-kr") + # Korean text roundtrip through EUC-KR + original_kr = "안녕하세요" + encoded_kr = Excoding.encode(original_kr, "euc-kr") + decoded_kr = Excoding.decode(encoded_kr, "euc-kr") + assert decoded_kr == original_kr + end end end From 48fc309f0d3f99e9f7af34e612e4e3a5169d402e Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:00:59 -0800 Subject: [PATCH 02/15] Use standard Elixir API convention (breaking change) - encode/2 and decode/2 now return {:ok, result} | {:error, reason} - encode!/2 and decode!/2 return raw value or raise - Removed safe_encode/safe_decode (redundant with new API) This follows standard Elixir conventions for fallible operations. 
Co-Authored-By: Claude Opus 4.5 --- CHANGELOG.md | 15 +++--- lib/excoding.ex | 107 ++++++++++++++++++----------------------- test/excoding_test.exs | 68 +++++++++++++------------- 3 files changed, 89 insertions(+), 101 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d79814b..31054ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,19 @@ ## v0.2.0 (2025-01-22) +### Breaking Changes + +- `encode/2` now returns `{:ok, binary}` or `{:error, reason}` (previously returned raw binary) +- `decode/2` now returns `{:ok, string}` or `{:error, reason}` (previously returned raw string) +- Use `encode!/2` and `decode!/2` for the old behavior (returns raw value, raises on error) + ### New Features - **Switched to `encoding_rs`**: Now uses the same encoding library as Firefox for better performance and active maintenance - **Dirty schedulers**: Operations on binaries larger than 64KB automatically use dirty CPU schedulers to avoid blocking the BEAM - **New functions**: - - `safe_encode/2` - Encode with `{:ok, result}` / `{:error, reason}` return - - `safe_decode/2` - Decode with `{:ok, result}` / `{:error, reason}` return + - `encode!/2` - Encode, returns raw value or raises + - `decode!/2` - Decode, returns raw value or raises - `encoding_exists?/1` - Check if an encoding is supported - `canonical_name/1` - Get the canonical WHATWG name for an encoding alias - `list_encodings/0` - List all supported encodings @@ -25,11 +31,6 @@ - Fixes OTP-26+ compilation issues (#40) - Fixes panic on error (#24) -### Backwards Compatibility - -- `encode/2` and `decode/2` maintain the same behavior as v0.1.x -- Use `safe_encode/2` and `safe_decode/2` for tuple-based error handling - ## v0.1.5 and earlier See [GitHub releases](https://github.com/elixir-ecto/excoding/releases) for previous versions. 
diff --git a/lib/excoding.ex b/lib/excoding.ex index d6e5447..721f496 100644 --- a/lib/excoding.ex +++ b/lib/excoding.ex @@ -11,6 +11,7 @@ defmodule Excoding do - **High performance**: Uses `encoding_rs`, the same library used by Firefox - **Dirty schedulers**: Large binaries (>64KB) automatically use dirty CPU schedulers to avoid blocking the BEAM + - **Safe error handling**: Returns `{:ok, result}` or `{:error, reason}` tuples - **WHATWG compliant**: Supports all encodings from the WHATWG Encoding Standard ## Supported Encodings @@ -26,18 +27,18 @@ defmodule Excoding do ## Examples - iex> Excoding.encode("¥₪ש", "windows-1255") - <<165, 164, 249>> - - iex> Excoding.decode(<<165, 164, 249>>, "windows-1255") - "¥₪ש" - - iex> Excoding.safe_encode("Hello", "windows-1252") + iex> Excoding.encode("Hello", "windows-1252") {:ok, "Hello"} - iex> Excoding.safe_decode(<<72, 101, 108, 108, 111>>, "windows-1252") + iex> Excoding.decode(<<72, 101, 108, 108, 111>>, "windows-1252") {:ok, "Hello"} + iex> Excoding.encode!("¥₪ש", "windows-1255") + <<165, 164, 249>> + + iex> Excoding.decode!(<<165, 164, 249>>, "windows-1255") + "¥₪ש" + iex> Excoding.encoding_exists?("utf-8") true @@ -50,34 +51,6 @@ defmodule Excoding do @doc """ Encodes a UTF-8 string to the specified encoding. - Returns the encoded binary on success, or raises `ArgumentError` if the - encoding is not recognized. Unmappable characters are replaced with a - suitable fallback character. - - Automatically uses dirty CPU schedulers for strings larger than 64KB. 
- - ## Examples - - iex> Excoding.encode("Hello", "windows-1252") - "Hello" - - iex> Excoding.encode("ћирилица", "windows-1251") - <<158, 232, 240, 232, 235, 232, 246, 224>> - - iex> Excoding.encode("Hello", "invalid-encoding") - ** (ArgumentError) unknown encoding: invalid-encoding - """ - @spec encode(String.t(), String.t()) :: binary() - def encode(string, encoding) when is_binary(string) and is_binary(encoding) do - case safe_encode(string, encoding) do - {:ok, binary} -> binary - {:error, :unknown_encoding} -> raise ArgumentError, "unknown encoding: #{encoding}" - end - end - - @doc """ - Encodes a UTF-8 string to the specified encoding, returning a tuple. - Returns `{:ok, binary}` on success, or `{:error, reason}` on failure. Unmappable characters are replaced with a suitable fallback character. @@ -85,14 +58,14 @@ defmodule Excoding do ## Examples - iex> Excoding.safe_encode("Hello", "windows-1252") + iex> Excoding.encode("Hello", "windows-1252") {:ok, "Hello"} - iex> Excoding.safe_encode("Hello", "invalid-encoding") + iex> Excoding.encode("Hello", "invalid-encoding") {:error, :unknown_encoding} """ - @spec safe_encode(String.t(), String.t()) :: {:ok, binary()} | {:error, atom()} - def safe_encode(string, encoding) when is_binary(string) and is_binary(encoding) do + @spec encode(String.t(), String.t()) :: {:ok, binary()} | {:error, atom()} + def encode(string, encoding) when is_binary(string) and is_binary(encoding) do if byte_size(string) > Native.dirty_threshold() do case Native.encode_dirty(string, encoding) do {:ok, binary} -> {:ok, binary} @@ -107,35 +80,28 @@ defmodule Excoding do end @doc """ - Decodes a binary from the specified encoding to a UTF-8 string. - - Returns the decoded string on success, or raises `ArgumentError` if the - encoding is not recognized. Unmappable bytes are replaced with the Unicode - replacement character (U+FFFD). + Encodes a UTF-8 string to the specified encoding. 
- Automatically uses dirty CPU schedulers for binaries larger than 64KB. + Returns the encoded binary on success, or raises an `ArgumentError` on failure. ## Examples - iex> Excoding.decode(<<72, 101, 108, 108, 111>>, "windows-1252") + iex> Excoding.encode!("Hello", "windows-1252") "Hello" - iex> Excoding.decode(<<158, 232, 240, 232, 235, 232, 246, 224>>, "windows-1251") - "ћирилица" - - iex> Excoding.decode(<<0xFF>>, "invalid-encoding") + iex> Excoding.encode!("Hello", "invalid-encoding") ** (ArgumentError) unknown encoding: invalid-encoding """ - @spec decode(binary(), String.t()) :: String.t() - def decode(binary, encoding) when is_binary(binary) and is_binary(encoding) do - case safe_decode(binary, encoding) do - {:ok, string} -> string + @spec encode!(String.t(), String.t()) :: binary() + def encode!(string, encoding) when is_binary(string) and is_binary(encoding) do + case encode(string, encoding) do + {:ok, binary} -> binary {:error, :unknown_encoding} -> raise ArgumentError, "unknown encoding: #{encoding}" end end @doc """ - Decodes a binary from the specified encoding to a UTF-8 string, returning a tuple. + Decodes a binary from the specified encoding to a UTF-8 string. Returns `{:ok, string}` on success, or `{:error, reason}` on failure. Unmappable bytes are replaced with the Unicode replacement character (U+FFFD). 
@@ -144,14 +110,14 @@ defmodule Excoding do ## Examples - iex> Excoding.safe_decode(<<72, 101, 108, 108, 111>>, "windows-1252") + iex> Excoding.decode(<<72, 101, 108, 108, 111>>, "windows-1252") {:ok, "Hello"} - iex> Excoding.safe_decode(<<0xFF>>, "invalid-encoding") + iex> Excoding.decode(<<0xFF>>, "invalid-encoding") {:error, :unknown_encoding} """ - @spec safe_decode(binary(), String.t()) :: {:ok, String.t()} | {:error, atom()} - def safe_decode(binary, encoding) when is_binary(binary) and is_binary(encoding) do + @spec decode(binary(), String.t()) :: {:ok, String.t()} | {:error, atom()} + def decode(binary, encoding) when is_binary(binary) and is_binary(encoding) do if byte_size(binary) > Native.dirty_threshold() do case Native.decode_dirty(binary, encoding) do {:ok, string} -> {:ok, string} @@ -165,6 +131,27 @@ defmodule Excoding do end end + @doc """ + Decodes a binary from the specified encoding to a UTF-8 string. + + Returns the decoded string on success, or raises an `ArgumentError` on failure. + + ## Examples + + iex> Excoding.decode!(<<72, 101, 108, 108, 111>>, "windows-1252") + "Hello" + + iex> Excoding.decode!(<<0xFF>>, "invalid-encoding") + ** (ArgumentError) unknown encoding: invalid-encoding + """ + @spec decode!(binary(), String.t()) :: String.t() + def decode!(binary, encoding) when is_binary(binary) and is_binary(encoding) do + case decode(binary, encoding) do + {:ok, string} -> string + {:error, :unknown_encoding} -> raise ArgumentError, "unknown encoding: #{encoding}" + end + end + @doc """ Checks if an encoding label is valid and supported. 
diff --git a/test/excoding_test.exs b/test/excoding_test.exs index 8a8ebcf..8f313ff 100644 --- a/test/excoding_test.exs +++ b/test/excoding_test.exs @@ -4,11 +4,11 @@ defmodule ExcodingTest do describe "decode/2" do test "decodes ISO-8859-1" do - assert "café" == Excoding.decode(<<99, 97, 102, 233>>, "iso-8859-1") + assert {:ok, "café"} == Excoding.decode(<<99, 97, 102, 233>>, "iso-8859-1") end test "decodes Windows-949 (Korean)" do - assert "Hello, 世界!" == + assert {:ok, "Hello, 世界!"} == Excoding.decode( <<72, 101, 108, 108, 111, 44, 32, 225, 166, 205, 163, 33>>, "windows-949" @@ -16,61 +16,61 @@ defmodule ExcodingTest do end test "decodes Windows-1251 (Cyrillic)" do - assert "ћирилица" == + assert {:ok, "ћирилица"} == Excoding.decode(<<158, 232, 240, 232, 235, 232, 246, 224>>, "windows-1251") end test "replaces invalid bytes with replacement character" do - assert "\u{c6b0}\u{c640}\u{fffd}\u{c559}" == + assert {:ok, "\u{c6b0}\u{c640}\u{fffd}\u{c559}"} == Excoding.decode(<<0xBF, 0xEC, 0xBF, 0xCD, 0xFF, 0xBE, 0xD3>>, "windows-949") # Same with alias - assert "\u{c6b0}\u{c640}\u{fffd}\u{c559}" == + assert {:ok, "\u{c6b0}\u{c640}\u{fffd}\u{c559}"} == Excoding.decode(<<0xBF, 0xEC, 0xBF, 0xCD, 0xFF, 0xBE, 0xD3>>, "euc-kr") end - test "raises on unknown encoding" do - assert_raise ArgumentError, ~r/unknown encoding/, fn -> - Excoding.decode(<<1, 2, 3>>, "not-an-encoding") - end + test "returns error for unknown encoding" do + assert {:error, :unknown_encoding} == Excoding.decode(<<1, 2, 3>>, "not-an-encoding") end end - describe "safe_decode/2" do - test "returns ok tuple on success" do - assert {:ok, "café"} == Excoding.safe_decode(<<99, 97, 102, 233>>, "iso-8859-1") + describe "decode!/2" do + test "returns decoded string directly" do + assert "café" == Excoding.decode!(<<99, 97, 102, 233>>, "iso-8859-1") end - test "returns error tuple on unknown encoding" do - assert {:error, :unknown_encoding} == Excoding.safe_decode(<<1, 2, 3>>, "not-an-encoding") + test "raises 
on unknown encoding" do + assert_raise ArgumentError, ~r/unknown encoding/, fn -> + Excoding.decode!(<<1, 2, 3>>, "not-an-encoding") + end end end describe "encode/2" do test "encodes to Windows-1251 (Cyrillic)" do - assert <<158, 232, 240, 232, 235, 232, 246, 224>> == + assert {:ok, <<158, 232, 240, 232, 235, 232, 246, 224>>} == Excoding.encode("ћирилица", "windows-1251") end test "encodes to Windows-1255 (Hebrew)" do - assert <<165, 164, 249>> == Excoding.encode("¥₪ש", "windows-1255") + assert {:ok, <<165, 164, 249>>} == Excoding.encode("¥₪ש", "windows-1255") end - test "raises on unknown encoding" do - assert_raise ArgumentError, ~r/unknown encoding/, fn -> - Excoding.encode("hello", "not-an-encoding") - end + test "returns error for unknown encoding" do + assert {:error, :unknown_encoding} == Excoding.encode("hello", "not-an-encoding") end end - describe "safe_encode/2" do - test "returns ok tuple on success" do - assert {:ok, <<158, 232, 240, 232, 235, 232, 246, 224>>} == - Excoding.safe_encode("ћирилица", "windows-1251") + describe "encode!/2" do + test "returns encoded binary directly" do + assert <<158, 232, 240, 232, 235, 232, 246, 224>> == + Excoding.encode!("ћирилица", "windows-1251") end - test "returns error tuple on unknown encoding" do - assert {:error, :unknown_encoding} == Excoding.safe_encode("hello", "not-an-encoding") + test "raises on unknown encoding" do + assert_raise ArgumentError, ~r/unknown encoding/, fn -> + Excoding.encode!("hello", "not-an-encoding") + end end end @@ -119,8 +119,8 @@ defmodule ExcodingTest do describe "roundtrip" do test "encode then decode returns original for UTF-8" do original = "Hello, 世界! 
🎉" - encoded = Excoding.encode(original, "utf-8") - decoded = Excoding.decode(encoded, "utf-8") + {:ok, encoded} = Excoding.encode(original, "utf-8") + {:ok, decoded} = Excoding.decode(encoded, "utf-8") assert decoded == original end @@ -129,8 +129,8 @@ defmodule ExcodingTest do original = "Hello World" for encoding <- ["windows-1252", "iso-8859-1", "windows-1251"] do - encoded = Excoding.encode(original, encoding) - decoded = Excoding.decode(encoded, encoding) + {:ok, encoded} = Excoding.encode(original, encoding) + {:ok, decoded} = Excoding.decode(encoded, encoding) assert decoded == original, "Roundtrip failed for #{encoding}" end end @@ -138,14 +138,14 @@ defmodule ExcodingTest do test "encode then decode for Asian encodings" do # Japanese text roundtrip through Shift_JIS original = "こんにちは" - encoded = Excoding.encode(original, "shift_jis") - decoded = Excoding.decode(encoded, "shift_jis") + {:ok, encoded} = Excoding.encode(original, "shift_jis") + {:ok, decoded} = Excoding.decode(encoded, "shift_jis") assert decoded == original # Korean text roundtrip through EUC-KR original_kr = "안녕하세요" - encoded_kr = Excoding.encode(original_kr, "euc-kr") - decoded_kr = Excoding.decode(encoded_kr, "euc-kr") + {:ok, encoded_kr} = Excoding.encode(original_kr, "euc-kr") + {:ok, decoded_kr} = Excoding.decode(encoded_kr, "euc-kr") assert decoded_kr == original_kr end end From 8a9ede78992b0bc9db9cee7c45bfdb26edc706b4 Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:09:59 -0800 Subject: [PATCH 03/15] Update CI and point precompiled binaries to fork - Update base_url to jeffhuen/excoding for precompiled binaries - Modernize GitHub Actions workflow (ubuntu-22.04, macos-14, actions v4) - Add CI workflow for tests across Elixir 1.15-1.18 and OTP 26-27 - Add lint checks for Elixir and Rust formatting Co-Authored-By: Claude Opus 4.5 --- .github/workflows/ci.yml | 108 ++++++++++++++++++++++ 
.github/workflows/rustler_precompiled.yml | 91 +++++++++--------- lib/excoding/native.ex | 2 +- 3 files changed, 156 insertions(+), 45 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..5d9168a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,108 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + MIX_ENV: test + RUSTLER_PRECOMPILATION_EXCODING_BUILD: "true" + +jobs: + test: + name: Test (Elixir ${{ matrix.elixir }} / OTP ${{ matrix.otp }}) + runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + include: + - elixir: "1.15" + otp: "26" + - elixir: "1.16" + otp: "26" + - elixir: "1.17" + otp: "27" + - elixir: "1.18" + otp: "27" + + steps: + - uses: actions/checkout@v4 + + - name: Set up Elixir + uses: erlef/setup-beam@v1 + with: + elixir-version: ${{ matrix.elixir }} + otp-version: ${{ matrix.otp }} + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache deps + uses: actions/cache@v4 + with: + path: deps + key: deps-${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}-${{ hashFiles('**/mix.lock') }} + restore-keys: deps-${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}- + + - name: Cache _build + uses: actions/cache@v4 + with: + path: _build + key: build-${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}-${{ hashFiles('**/mix.lock') }} + restore-keys: build-${{ runner.os }}-${{ matrix.otp }}-${{ matrix.elixir }}- + + - name: Cache Rust target + uses: actions/cache@v4 + with: + path: native/excoding/target + key: rust-${{ runner.os }}-${{ hashFiles('native/excoding/Cargo.lock') }} + restore-keys: rust-${{ runner.os }}- + + - name: Install dependencies + run: mix deps.get + + - name: Compile + run: mix compile --warnings-as-errors + + - name: Run tests + run: mix test + + lint: + name: Lint + runs-on: ubuntu-22.04 + env: + 
RUSTLER_PRECOMPILATION_EXCODING_BUILD: "true" + + steps: + - uses: actions/checkout@v4 + + - name: Set up Elixir + uses: erlef/setup-beam@v1 + with: + elixir-version: "1.17" + otp-version: "27" + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt, clippy + + - name: Cache deps + uses: actions/cache@v4 + with: + path: deps + key: deps-lint-${{ runner.os }}-${{ hashFiles('**/mix.lock') }} + + - name: Install dependencies + run: mix deps.get + + - name: Check formatting (Elixir) + run: mix format --check-formatted + + - name: Check formatting (Rust) + run: cargo fmt --manifest-path native/excoding/Cargo.toml -- --check + + - name: Clippy (Rust) + run: cargo clippy --manifest-path native/excoding/Cargo.toml -- -D warnings diff --git a/.github/workflows/rustler_precompiled.yml b/.github/workflows/rustler_precompiled.yml index d6b9c66..ed4a13e 100644 --- a/.github/workflows/rustler_precompiled.yml +++ b/.github/workflows/rustler_precompiled.yml @@ -3,7 +3,10 @@ name: Build precompiled NIFs on: push: tags: - - '*' + - 'v*' + +permissions: + contents: write jobs: build_release: @@ -14,52 +17,52 @@ jobs: matrix: nif: ["2.16", "2.15"] job: - - { target: arm-unknown-linux-gnueabihf , os: ubuntu-20.04 , use-cross: true } - - { target: aarch64-unknown-linux-gnu , os: ubuntu-20.04 , use-cross: true } - - { target: aarch64-unknown-linux-musl , os: ubuntu-20.04 , use-cross: true } - - { target: aarch64-apple-darwin , os: macos-11 } - - { target: riscv64gc-unknown-linux-gnu , os: ubuntu-20.04 , use-cross: true } - - { target: x86_64-apple-darwin , os: macos-11 } - - { target: x86_64-unknown-linux-gnu , os: ubuntu-20.04 } - - { target: x86_64-unknown-linux-musl , os: ubuntu-20.04 , use-cross: true } - - { target: x86_64-pc-windows-gnu , os: windows-2019 } - - { target: x86_64-pc-windows-msvc , os: windows-2019 } + - { target: arm-unknown-linux-gnueabihf , os: ubuntu-22.04 , use-cross: true } + - { target: aarch64-unknown-linux-gnu , 
os: ubuntu-22.04 , use-cross: true } + - { target: aarch64-unknown-linux-musl , os: ubuntu-22.04 , use-cross: true } + - { target: aarch64-apple-darwin , os: macos-14 } + - { target: riscv64gc-unknown-linux-gnu , os: ubuntu-22.04 , use-cross: true } + - { target: x86_64-apple-darwin , os: macos-13 } + - { target: x86_64-unknown-linux-gnu , os: ubuntu-22.04 } + - { target: x86_64-unknown-linux-musl , os: ubuntu-22.04 , use-cross: true } + - { target: x86_64-pc-windows-gnu , os: windows-2022 } + - { target: x86_64-pc-windows-msvc , os: windows-2022 } steps: - - name: Checkout source code - uses: actions/checkout@v3 + - name: Checkout source code + uses: actions/checkout@v4 + + - name: Extract project version + shell: bash + run: | + echo "PROJECT_VERSION=$(sed -n 's/^ @version "\(.*\)"/\1/p' mix.exs | head -n1)" >> $GITHUB_ENV - - name: Extract project version - shell: bash - run: | - # Get the project version from mix.exs - echo "PROJECT_VERSION=$(sed -n 's/^ @version "\(.*\)"/\1/p' mix.exs | head -n1)" >> $GITHUB_ENV - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - with: - toolchain: stable - target: ${{ matrix.job.target }} + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + targets: ${{ matrix.job.target }} - - name: Build the project - id: build-crate - uses: philss/rustler-precompiled-action@v1.0.0 - with: - project-name: excoding - project-version: ${{ env.PROJECT_VERSION }} - target: ${{ matrix.job.target }} - nif-version: ${{ matrix.nif }} - use-cross: ${{ matrix.job.use-cross }} - project-dir: "native/excoding" + - name: Build the project + id: build-crate + uses: philss/rustler-precompiled-action@v1.1.4 + with: + project-name: excoding + project-version: ${{ env.PROJECT_VERSION }} + target: ${{ matrix.job.target }} + nif-version: ${{ matrix.nif }} + use-cross: ${{ matrix.job.use-cross }} + project-dir: "native/excoding" - - name: Artifact upload - uses: actions/upload-artifact@v3 - 
with: - name: ${{ steps.build-crate.outputs.file-name }} - path: ${{ steps.build-crate.outputs.file-path }} + - name: Artifact upload + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.build-crate.outputs.file-name }} + path: ${{ steps.build-crate.outputs.file-path }} - - name: Publish archives and packages - uses: softprops/action-gh-release@v1 - with: - files: | - ${{ steps.build-crate.outputs.file-path }} - if: startsWith(github.ref, 'refs/tags/') + - name: Publish archives and packages + uses: softprops/action-gh-release@v2 + with: + files: | + ${{ steps.build-crate.outputs.file-path }} + if: startsWith(github.ref, 'refs/tags/') diff --git a/lib/excoding/native.ex b/lib/excoding/native.ex index c5265e7..5ca58b0 100644 --- a/lib/excoding/native.ex +++ b/lib/excoding/native.ex @@ -7,7 +7,7 @@ defmodule Excoding.Native do use RustlerPrecompiled, otp_app: :excoding, crate: "excoding", - base_url: "https://github.com/elixir-ecto/excoding/releases/download/v#{version}", + base_url: "https://github.com/jeffhuen/excoding/releases/download/v#{version}", force_build: System.get_env("RUSTLER_PRECOMPILATION_EXCODING_BUILD") in ["1", "true"], mode: if(Mix.env() == :prod, do: :release, else: :debug), targets: From 05b7b0919733a7c3e3506f83fe2368ed111c52be Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:13:51 -0800 Subject: [PATCH 04/15] Fix NIF precompilation for Rustler 0.30+ - Add NIF version features to Cargo.toml (required since Rustler 0.30.0) - Update macOS x86_64 runner from macos-13 to macos-15 Co-Authored-By: Claude Opus 4.5 --- .github/workflows/rustler_precompiled.yml | 2 +- native/excoding/Cargo.toml | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rustler_precompiled.yml b/.github/workflows/rustler_precompiled.yml index ed4a13e..884ac1b 100644 --- a/.github/workflows/rustler_precompiled.yml +++ b/.github/workflows/rustler_precompiled.yml @@ -22,7 
+22,7 @@ jobs: - { target: aarch64-unknown-linux-musl , os: ubuntu-22.04 , use-cross: true } - { target: aarch64-apple-darwin , os: macos-14 } - { target: riscv64gc-unknown-linux-gnu , os: ubuntu-22.04 , use-cross: true } - - { target: x86_64-apple-darwin , os: macos-13 } + - { target: x86_64-apple-darwin , os: macos-15 } - { target: x86_64-unknown-linux-gnu , os: ubuntu-22.04 } - { target: x86_64-unknown-linux-musl , os: ubuntu-22.04 , use-cross: true } - { target: x86_64-pc-windows-gnu , os: windows-2022 } diff --git a/native/excoding/Cargo.toml b/native/excoding/Cargo.toml index e6e8ee2..3afea08 100644 --- a/native/excoding/Cargo.toml +++ b/native/excoding/Cargo.toml @@ -12,3 +12,11 @@ crate-type = ["cdylib"] [dependencies] rustler = "0.37" encoding_rs = "0.8" + +[features] +default = [] +# NIF version features for rustler_precompiled +nif_version_2_14 = ["rustler/nif_version_2_14"] +nif_version_2_15 = ["rustler/nif_version_2_15"] +nif_version_2_16 = ["rustler/nif_version_2_16"] +nif_version_2_17 = ["rustler/nif_version_2_17"] From a5a081ca669842b414e643200f7f0691cbe2adfb Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:21:48 -0800 Subject: [PATCH 05/15] Add checksum file for precompiled NIF binaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove checksum-* from .gitignore (checksum file must be committed) - Add checksums for all 20 precompiled binaries (10 targets × 2 NIF versions) Co-Authored-By: Claude Opus 4.5 --- .gitignore | 2 -- checksum-Elixir.Excoding.Native.exs | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 checksum-Elixir.Excoding.Native.exs diff --git a/.gitignore b/.gitignore index a43ddc3..591e074 100644 --- a/.gitignore +++ b/.gitignore @@ -28,5 +28,3 @@ excoding-*.tar /priv/native/ /native/*/target - -checksum-* diff --git a/checksum-Elixir.Excoding.Native.exs 
b/checksum-Elixir.Excoding.Native.exs new file mode 100644 index 0000000..c660094 --- /dev/null +++ b/checksum-Elixir.Excoding.Native.exs @@ -0,0 +1,22 @@ +%{ + "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:bad90fe5f3917c5f3eb239e40bde029aa4f97700d158d6a46601039a39eaa469", + "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:f65cbcda2e1cbcaeb578cb8ad9969f77a8bd13d1333f608c2b4c2d57efcd1f29", + "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:5106e7ed22053a2387bec9a1f5a18b4da38a0869df6d7c52b0bf4190b0d64b38", + "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:fc46a34438b53c2deb0c20892e1f0f1251fa75376820d848714715e2d5dff948", + "libexcoding-v0.2.0-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:b4017a503b6f317cf382c1823a0e0c09fe1b6f2744e9d137bc93141a390a2f96", + "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:13dade3c0b700776e96b49a7a9ac25ea64f5c883ea96886417f339d17e1e9bfe", + "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:df98f29f3ab12c3f81777348eca66b29843581fd2fb627bec498ac142fa4ca6a", + "libexcoding-v0.2.0-nif-2.15-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:36119f88dec13d045ae4a1626f58b3b5cc97dd831c8abf9ed6b581d171ba0b55", + "libexcoding-v0.2.0-nif-2.15-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:237cd84a71d1788ceab8a39a713a7c0e905793581545506a04d52bbbcb6cf5c4", + "libexcoding-v0.2.0-nif-2.15-x86_64-apple-darwin.so.tar.gz" => "sha256:a70b409a7c11c3b078cde689e6794a5f6d20a83b9ed986d9452e93d09013f4aa", + "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:2a6928682b0b2edb2e759b24b74c54b353091ad3f9171337fc4aac82e20a4802", + "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:38b4a312d3adb55237716b1cbc977b40dcb1348aaa82eef846bae2a2050bb1de", + "libexcoding-v0.2.0-nif-2.16-aarch64-apple-darwin.so.tar.gz" => 
"sha256:280a57c0889fbd54a14faf0a341470da61f1e1d5d122004c9ab9cad32339541f", + "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:ab4b0b5be643721e1743a467880f9e331aef262c55ae86db501406aa6ed7f28e", + "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:b74493213cba04bb8a24dad40b8864868225ca4ce3c2a14fed7295b94da1fd6e", + "libexcoding-v0.2.0-nif-2.16-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:3476cbc8c7b2dd87b68c29935180b6bbc7ae715ef45ab8ee4aa1abaa92becb94", + "libexcoding-v0.2.0-nif-2.16-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:a4acdc597bf3a45030654b945b2970f23b48f13df26e1f4815bcc2fcdfd4b01b", + "libexcoding-v0.2.0-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:e442044e5c18ec44ffc9d106e015ddcd5527523572e705ce740a94e991b72cb6", + "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:8b57891fc01bb09bf935ec53622179a78eb04b812c22b781a6ef31bf5f874b6d", + "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:4559e94b3dad0137630aa8c9bd759768591e4c64f2d793624825cf0cfdf3e06f", +} From 6a7eba2e4c578822d33145070426768ded22a837 Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:22:56 -0800 Subject: [PATCH 06/15] Add NIF 2.17 support for OTP 27-28 Co-Authored-By: Claude Opus 4.5 --- .github/workflows/rustler_precompiled.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rustler_precompiled.yml b/.github/workflows/rustler_precompiled.yml index 884ac1b..2b748e0 100644 --- a/.github/workflows/rustler_precompiled.yml +++ b/.github/workflows/rustler_precompiled.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - nif: ["2.16", "2.15"] + nif: ["2.17", "2.16", "2.15"] job: - { target: arm-unknown-linux-gnueabihf , os: ubuntu-22.04 , use-cross: true } - { target: aarch64-unknown-linux-gnu , os: ubuntu-22.04 , use-cross: true } From 
9ed2dc9bf6698a53e6175943e83d6048867122ab Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:28:39 -0800 Subject: [PATCH 07/15] Add NIF 2.17 checksums for OTP 27-28 --- checksum-Elixir.Excoding.Native.exs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/checksum-Elixir.Excoding.Native.exs b/checksum-Elixir.Excoding.Native.exs index c660094..e28c900 100644 --- a/checksum-Elixir.Excoding.Native.exs +++ b/checksum-Elixir.Excoding.Native.exs @@ -3,6 +3,8 @@ "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:f65cbcda2e1cbcaeb578cb8ad9969f77a8bd13d1333f608c2b4c2d57efcd1f29", "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:5106e7ed22053a2387bec9a1f5a18b4da38a0869df6d7c52b0bf4190b0d64b38", "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:fc46a34438b53c2deb0c20892e1f0f1251fa75376820d848714715e2d5dff948", + "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:0f2e9b53e76f3b9479d6bcbec9151658a52651833c1b27c1fb604078b78aa451", + "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:8eb7221d7f3cf52e4c79f7624d41b6ba1d935ada185baeeb8a551528129f88bc", "libexcoding-v0.2.0-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:b4017a503b6f317cf382c1823a0e0c09fe1b6f2744e9d137bc93141a390a2f96", "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:13dade3c0b700776e96b49a7a9ac25ea64f5c883ea96886417f339d17e1e9bfe", "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:df98f29f3ab12c3f81777348eca66b29843581fd2fb627bec498ac142fa4ca6a", @@ -19,4 +21,12 @@ "libexcoding-v0.2.0-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:e442044e5c18ec44ffc9d106e015ddcd5527523572e705ce740a94e991b72cb6", "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:8b57891fc01bb09bf935ec53622179a78eb04b812c22b781a6ef31bf5f874b6d", 
"libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:4559e94b3dad0137630aa8c9bd759768591e4c64f2d793624825cf0cfdf3e06f", + "libexcoding-v0.2.0-nif-2.17-aarch64-apple-darwin.so.tar.gz" => "sha256:24eb39dd54130af0e64d44137de4f22c9e17cf6eed7ee6026fe0a0989b0a3049", + "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:96049c0d680493e5dcac6e9386f4210af5cae7b82c3a4972e13baa93d1b65d5f", + "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:f2d473b107358adb25514c5130b0ad699409ac895c08e51b1b64142f707d00df", + "libexcoding-v0.2.0-nif-2.17-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:43349908044bb4c49475bbbf9f6ac07ede297ece51cbfe050f7004ab5abb2f49", + "libexcoding-v0.2.0-nif-2.17-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:5cf3f5ad914c26ba9bb2c4255a1703392b1a41eb81302ae4b7284092388f5e2c", + "libexcoding-v0.2.0-nif-2.17-x86_64-apple-darwin.so.tar.gz" => "sha256:ffde4bdf2b5fb4e2ebcbfe14d36362121fb55a34cb3f11fbcc1a7cfb3b457628", + "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:c94cfe078f94f4501ff1c454664dbd8c00ffa977a1d8f9d4648cce1ea822c562", + "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:73d525ac9d69060c4a8b6e9582d63996671a53425077079ac158130367a13381", } From b584a2ac70c72b8d8861449b6098bdf8fa907cc3 Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:36:45 -0800 Subject: [PATCH 08/15] Add Elixir 1.19 / OTP 28 to CI matrix --- .github/workflows/ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5d9168a..aa4b7a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,8 @@ jobs: otp: "27" - elixir: "1.18" otp: "27" + - elixir: "1.19" + otp: "28" steps: - uses: actions/checkout@v4 @@ -81,8 +83,8 @@ jobs: - name: Set up Elixir uses: erlef/setup-beam@v1 with: - 
elixir-version: "1.17" - otp-version: "27" + elixir-version: "1.19" + otp-version: "28" - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable From 42c3d03af6c19a400d636f535bc7ff451384954c Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:40:10 -0800 Subject: [PATCH 09/15] Add *.local.md to gitignore for local notes --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 591e074..14a2c39 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ excoding-*.tar /priv/native/ /native/*/target + +# Local notes (not committed) +*.local.md From d727c92dd52f0d99fb56c537c7ef3efd7619341c Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 13:02:59 -0800 Subject: [PATCH 10/15] Fix checksum file to match v0.2.0 release binaries The checksums were generated before the final release binaries were uploaded. This updates them to match the actual release artifacts. 
Co-Authored-By: Claude Opus 4.5 --- checksum-Elixir.Excoding.Native.exs | 60 ++++++++++++++--------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/checksum-Elixir.Excoding.Native.exs b/checksum-Elixir.Excoding.Native.exs index e28c900..17ec800 100644 --- a/checksum-Elixir.Excoding.Native.exs +++ b/checksum-Elixir.Excoding.Native.exs @@ -1,32 +1,32 @@ %{ - "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:bad90fe5f3917c5f3eb239e40bde029aa4f97700d158d6a46601039a39eaa469", - "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:f65cbcda2e1cbcaeb578cb8ad9969f77a8bd13d1333f608c2b4c2d57efcd1f29", - "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:5106e7ed22053a2387bec9a1f5a18b4da38a0869df6d7c52b0bf4190b0d64b38", - "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:fc46a34438b53c2deb0c20892e1f0f1251fa75376820d848714715e2d5dff948", - "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:0f2e9b53e76f3b9479d6bcbec9151658a52651833c1b27c1fb604078b78aa451", - "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:8eb7221d7f3cf52e4c79f7624d41b6ba1d935ada185baeeb8a551528129f88bc", - "libexcoding-v0.2.0-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:b4017a503b6f317cf382c1823a0e0c09fe1b6f2744e9d137bc93141a390a2f96", - "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:13dade3c0b700776e96b49a7a9ac25ea64f5c883ea96886417f339d17e1e9bfe", - "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:df98f29f3ab12c3f81777348eca66b29843581fd2fb627bec498ac142fa4ca6a", - "libexcoding-v0.2.0-nif-2.15-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:36119f88dec13d045ae4a1626f58b3b5cc97dd831c8abf9ed6b581d171ba0b55", - "libexcoding-v0.2.0-nif-2.15-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:237cd84a71d1788ceab8a39a713a7c0e905793581545506a04d52bbbcb6cf5c4", - 
"libexcoding-v0.2.0-nif-2.15-x86_64-apple-darwin.so.tar.gz" => "sha256:a70b409a7c11c3b078cde689e6794a5f6d20a83b9ed986d9452e93d09013f4aa", - "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:2a6928682b0b2edb2e759b24b74c54b353091ad3f9171337fc4aac82e20a4802", - "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:38b4a312d3adb55237716b1cbc977b40dcb1348aaa82eef846bae2a2050bb1de", - "libexcoding-v0.2.0-nif-2.16-aarch64-apple-darwin.so.tar.gz" => "sha256:280a57c0889fbd54a14faf0a341470da61f1e1d5d122004c9ab9cad32339541f", - "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:ab4b0b5be643721e1743a467880f9e331aef262c55ae86db501406aa6ed7f28e", - "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:b74493213cba04bb8a24dad40b8864868225ca4ce3c2a14fed7295b94da1fd6e", - "libexcoding-v0.2.0-nif-2.16-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:3476cbc8c7b2dd87b68c29935180b6bbc7ae715ef45ab8ee4aa1abaa92becb94", - "libexcoding-v0.2.0-nif-2.16-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:a4acdc597bf3a45030654b945b2970f23b48f13df26e1f4815bcc2fcdfd4b01b", - "libexcoding-v0.2.0-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:e442044e5c18ec44ffc9d106e015ddcd5527523572e705ce740a94e991b72cb6", - "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:8b57891fc01bb09bf935ec53622179a78eb04b812c22b781a6ef31bf5f874b6d", - "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:4559e94b3dad0137630aa8c9bd759768591e4c64f2d793624825cf0cfdf3e06f", - "libexcoding-v0.2.0-nif-2.17-aarch64-apple-darwin.so.tar.gz" => "sha256:24eb39dd54130af0e64d44137de4f22c9e17cf6eed7ee6026fe0a0989b0a3049", - "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:96049c0d680493e5dcac6e9386f4210af5cae7b82c3a4972e13baa93d1b65d5f", - "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-musl.so.tar.gz" => 
"sha256:f2d473b107358adb25514c5130b0ad699409ac895c08e51b1b64142f707d00df", - "libexcoding-v0.2.0-nif-2.17-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:43349908044bb4c49475bbbf9f6ac07ede297ece51cbfe050f7004ab5abb2f49", - "libexcoding-v0.2.0-nif-2.17-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:5cf3f5ad914c26ba9bb2c4255a1703392b1a41eb81302ae4b7284092388f5e2c", - "libexcoding-v0.2.0-nif-2.17-x86_64-apple-darwin.so.tar.gz" => "sha256:ffde4bdf2b5fb4e2ebcbfe14d36362121fb55a34cb3f11fbcc1a7cfb3b457628", - "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:c94cfe078f94f4501ff1c454664dbd8c00ffa977a1d8f9d4648cce1ea822c562", - "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:73d525ac9d69060c4a8b6e9582d63996671a53425077079ac158130367a13381", + "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:ae170f935e4dff1d60470656aefcf8c5ff360a72939a5cc60400a5a319cec825", + "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:9c91b0b284e0dd15300db04c129980cf7ca2fabaa612d7401cb8ca3141389338", + "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:a8ccd6014d73cd721fa7b8a07a58e115601aea55a881e166d2c87fede3f092a7", + "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:036e37b0526a698242aaa6c1582adff22cd318a152e38e603a72f8a32e2aa6d5", + "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:ddaf0f2fca1e7e74051cd10485a12a67d6347b18d3f7afb316a9dc845529fde1", + "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:fbca01e81304e299b1bd0f0dabd8b88d8e8d25527cac461ae90e4cb4646e740d", + "libexcoding-v0.2.0-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:d27d156c30d337fa9f95e5beb10005f83d38916376b2a0ab3e5fdb07f28d6af2", + "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:02bb8b82b8c4a3d9650ca8dc54588bd7c9602266397b950337d666c1a863f732", + 
"libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:9a0c6de1a4e45a0ff6d5b67e39fe04447060bc41613a993662c764c92bac4723", + "libexcoding-v0.2.0-nif-2.15-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:02bfa4d8b15d941bb6d1efc03d1d7ca90c833581059b448fb1b83955eea497f5", + "libexcoding-v0.2.0-nif-2.15-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:e664b0e17ac0e98c76a5580e0866840540491bdb0f52b15742123c18c5839754", + "libexcoding-v0.2.0-nif-2.15-x86_64-apple-darwin.so.tar.gz" => "sha256:3f0192bbaf9facdfb60c2f57dd1f6a33a496a25b1d576d90eeb767716d1673da", + "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:7b45334b766f43761cfe6de2ce2b1c325576ec6597462921e45fee4ff1351c92", + "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:d7f0125d718898257a06e418a094cbaf901371c47d0eea24e7131131d11971b5", + "libexcoding-v0.2.0-nif-2.16-aarch64-apple-darwin.so.tar.gz" => "sha256:4a8e0fc7780baa0045d7854930fa353cbe3596dbb687cfe38a6fd9d3d291b625", + "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:89cce0ec70ee854ed6ad21b68356bcbbc913e3d445a50ed69c43e1e4251b527c", + "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:34bc71ca66c1482346e2af23efa583d1855b33496934dbec72677718c419e9c4", + "libexcoding-v0.2.0-nif-2.16-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:acf911fdbf907da1b09742af515a422fd971a8c9b3af985d84c316b52ddfd286", + "libexcoding-v0.2.0-nif-2.16-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:866933c694a242382e9caf9332945bd60757cb4b74b478957bedfc191647b2c3", + "libexcoding-v0.2.0-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:25711e7aee9dbaa0d6b38c5c750dce3e6cfa52f41f81165a861060594784d93c", + "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:41c6cde813f0ee828aa3b6e9cc1ff49a7f086d8fe186f7d817d83889dc5eb9e6", + "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => 
"sha256:004c2013ea03a5616fa074440ecf251bdc98961c9973953ef4e7b46e85ff3abc", + "libexcoding-v0.2.0-nif-2.17-aarch64-apple-darwin.so.tar.gz" => "sha256:7a97a59806d9626c9835d2662ce60b26bcf1d9dc00b916a376f38d6a7ee686d8", + "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:372296b0d5d7ce67b5c87ac5e6108e06f69ebaf4f9e9ab4a190f3e083ecd35b8", + "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:d72416673e0b1c8ebd1be2381c9f52c691a2ace0169d4e128387e6ddce8b6464", + "libexcoding-v0.2.0-nif-2.17-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:e81d8406275614edba91e8fb14fe389b1a1ae6d171344c7119f8e445fc85a991", + "libexcoding-v0.2.0-nif-2.17-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:4b86a0be6a65e706502b5924610b878f1350f097f4d012f4dd015c5c08fb716d", + "libexcoding-v0.2.0-nif-2.17-x86_64-apple-darwin.so.tar.gz" => "sha256:dda294f69b12914424e527549bbf42b0778be311a0e0656192773cd01cc2e7f6", + "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:843b9ef16af3a566e122e8f6ea255086f75d0d700a92784d0c22c74e668acfae", + "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:ee7d18326c942dbcee58c198e02abdf39bc4f49a69498d56b10a91520af4c34e", } From 7a64e88accb845eb736afc0f84d329d664c9efc9 Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 13:11:37 -0800 Subject: [PATCH 11/15] Fix checksums with actual sha256 of release tarballs GitHub API digest field is NOT the sha256 of the tar.gz file. Computed checksums by downloading each file and running shasum -a 256. 
Co-Authored-By: Claude Opus 4.5 --- checksum-Elixir.Excoding.Native.exs | 60 ++++++++++++++--------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/checksum-Elixir.Excoding.Native.exs b/checksum-Elixir.Excoding.Native.exs index 17ec800..c302bee 100644 --- a/checksum-Elixir.Excoding.Native.exs +++ b/checksum-Elixir.Excoding.Native.exs @@ -1,32 +1,32 @@ %{ - "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:ae170f935e4dff1d60470656aefcf8c5ff360a72939a5cc60400a5a319cec825", - "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:9c91b0b284e0dd15300db04c129980cf7ca2fabaa612d7401cb8ca3141389338", - "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:a8ccd6014d73cd721fa7b8a07a58e115601aea55a881e166d2c87fede3f092a7", - "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:036e37b0526a698242aaa6c1582adff22cd318a152e38e603a72f8a32e2aa6d5", - "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:ddaf0f2fca1e7e74051cd10485a12a67d6347b18d3f7afb316a9dc845529fde1", - "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:fbca01e81304e299b1bd0f0dabd8b88d8e8d25527cac461ae90e4cb4646e740d", - "libexcoding-v0.2.0-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:d27d156c30d337fa9f95e5beb10005f83d38916376b2a0ab3e5fdb07f28d6af2", - "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:02bb8b82b8c4a3d9650ca8dc54588bd7c9602266397b950337d666c1a863f732", - "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:9a0c6de1a4e45a0ff6d5b67e39fe04447060bc41613a993662c764c92bac4723", - "libexcoding-v0.2.0-nif-2.15-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:02bfa4d8b15d941bb6d1efc03d1d7ca90c833581059b448fb1b83955eea497f5", - "libexcoding-v0.2.0-nif-2.15-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:e664b0e17ac0e98c76a5580e0866840540491bdb0f52b15742123c18c5839754", - 
"libexcoding-v0.2.0-nif-2.15-x86_64-apple-darwin.so.tar.gz" => "sha256:3f0192bbaf9facdfb60c2f57dd1f6a33a496a25b1d576d90eeb767716d1673da", - "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:7b45334b766f43761cfe6de2ce2b1c325576ec6597462921e45fee4ff1351c92", - "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:d7f0125d718898257a06e418a094cbaf901371c47d0eea24e7131131d11971b5", - "libexcoding-v0.2.0-nif-2.16-aarch64-apple-darwin.so.tar.gz" => "sha256:4a8e0fc7780baa0045d7854930fa353cbe3596dbb687cfe38a6fd9d3d291b625", - "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:89cce0ec70ee854ed6ad21b68356bcbbc913e3d445a50ed69c43e1e4251b527c", - "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:34bc71ca66c1482346e2af23efa583d1855b33496934dbec72677718c419e9c4", - "libexcoding-v0.2.0-nif-2.16-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:acf911fdbf907da1b09742af515a422fd971a8c9b3af985d84c316b52ddfd286", - "libexcoding-v0.2.0-nif-2.16-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:866933c694a242382e9caf9332945bd60757cb4b74b478957bedfc191647b2c3", - "libexcoding-v0.2.0-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:25711e7aee9dbaa0d6b38c5c750dce3e6cfa52f41f81165a861060594784d93c", - "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:41c6cde813f0ee828aa3b6e9cc1ff49a7f086d8fe186f7d817d83889dc5eb9e6", - "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:004c2013ea03a5616fa074440ecf251bdc98961c9973953ef4e7b46e85ff3abc", - "libexcoding-v0.2.0-nif-2.17-aarch64-apple-darwin.so.tar.gz" => "sha256:7a97a59806d9626c9835d2662ce60b26bcf1d9dc00b916a376f38d6a7ee686d8", - "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:372296b0d5d7ce67b5c87ac5e6108e06f69ebaf4f9e9ab4a190f3e083ecd35b8", - "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-musl.so.tar.gz" => 
"sha256:d72416673e0b1c8ebd1be2381c9f52c691a2ace0169d4e128387e6ddce8b6464", - "libexcoding-v0.2.0-nif-2.17-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:e81d8406275614edba91e8fb14fe389b1a1ae6d171344c7119f8e445fc85a991", - "libexcoding-v0.2.0-nif-2.17-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:4b86a0be6a65e706502b5924610b878f1350f097f4d012f4dd015c5c08fb716d", - "libexcoding-v0.2.0-nif-2.17-x86_64-apple-darwin.so.tar.gz" => "sha256:dda294f69b12914424e527549bbf42b0778be311a0e0656192773cd01cc2e7f6", - "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:843b9ef16af3a566e122e8f6ea255086f75d0d700a92784d0c22c74e668acfae", - "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:ee7d18326c942dbcee58c198e02abdf39bc4f49a69498d56b10a91520af4c34e", + "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:cf07d357e8aabb0f88c846b89ce777cdd7cbdb01983e2b59366ba79a2069c0de", + "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:c8ae1a2cc864e89de662bad309ae6b2d50f4341a4ba37a02f7e9285f2aea6e77", + "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:62ceddbc40466a8ab3fc45d9679194515a0f39e09adbd2757d133c2f136498f3", + "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:71196998fbaad1d3e0adb3305292093291ba03620390ad475618f3fb30890aba", + "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:8c5388d77467e0ac741d0323508aa23a978d675f91ab3b6cb868a371b3b2ab28", + "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:9c58457b78a20928b51de4531fe9626ba0b3a9af0ecab150bf1ba496be5469a1", + "libexcoding-v0.2.0-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:0d6bac520e5e991d57bfe3e5ff4ac683334ac77abfe16a2016b39be58d15e9de", + "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:cd606cfc82f2d046125327c74172ced93e629fb828b19fa4c547e5c2a9912ff4", + 
"libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:5902e24f375584f78b1c5c97d36f5e326f9c1ecebb36ddf0afd154369005e528", + "libexcoding-v0.2.0-nif-2.15-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:9efb6f33e99d37286c187adffb89044fb5ed7b5e6de49e5224895bd310022184", + "libexcoding-v0.2.0-nif-2.15-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:514407f6d82bf34341444562c84dd49d0dab564e28448344614a6b2b45f1634e", + "libexcoding-v0.2.0-nif-2.15-x86_64-apple-darwin.so.tar.gz" => "sha256:9c0ec78635cc220f21f85f5b87896b33b60684b3a068e5da162892200a4272bf", + "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:58a3f40a71f5e724f29eca5c24e3dd86fb0430b38bd5550251ce8e2ec1c1a749", + "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:3c33b07b1c423fddd53906c2cb307bd8908d3a55355c6d687d7d9dca3298dacd", + "libexcoding-v0.2.0-nif-2.16-aarch64-apple-darwin.so.tar.gz" => "sha256:3d9ab4fa32f1ea2289344d4d37eea3f1a4b74b00c5bb61673a109b17a885021d", + "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:b1cf49519760283a9b9a7705928f1715af77825cde5cc3de12fddb33f9270c21", + "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:ac3457a1c21f8374496b8ca42625af3b326fb70c02bb27183e21c26ab9ad9f54", + "libexcoding-v0.2.0-nif-2.16-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:92b5dbdf8322d3f00c49983b489405ee898d7b27e7fa00ebf13e30baf7e79287", + "libexcoding-v0.2.0-nif-2.16-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:f137a900ff3fc9ec52e54ee6cd5c41d5945da70c833344c7672ab0b144ab035e", + "libexcoding-v0.2.0-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:5ef49bc7673de7dd1bbcdbe78f3833ee0596d7ef2adbcacf566316f44993747c", + "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:6c3502d2ec5b9c4908bbbc56a7337a27b7f62889b2369d078779f22c694922e7", + "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => 
"sha256:8fb1ae5e9cb012dbce98d8d323e31676ebd7ee127df2e948e4c713be4c25c8b7", + "libexcoding-v0.2.0-nif-2.17-aarch64-apple-darwin.so.tar.gz" => "sha256:dbb24f9f478a90dac15f2437ec02a1daceb4fad9a519d637a5f67feaaf5c4a97", + "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:b54e03e9c70c8e955e8fb55b9ede2b92f3751f2b1bfb43556255812d6888f2eb", + "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:988e6546c5405be48f590a0202e41f56024e401fa762f4ad5e8b171ede7dfa85", + "libexcoding-v0.2.0-nif-2.17-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:944e3204203c1132cf78cd58c9472fa7102f1f9e38ded191184cb3cf7ca86412", + "libexcoding-v0.2.0-nif-2.17-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:88ff8d61b558609d217a214de37c3bba717f1161808e69fa44d6439aa00668a2", + "libexcoding-v0.2.0-nif-2.17-x86_64-apple-darwin.so.tar.gz" => "sha256:df39551fa82bb58b512d37f6a9ed3c045a0f9a1c12423a5005c2f25a4692e8fb", + "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:863cbfb3de4fa5fe14f62f4f1b9eefa257974b3c3eec8f0d85cc979b3f8f6a36", + "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:573d6ce23e9f7f12ebddcc67517f2a3e0f2cf3b12208c419124b43fc44bfcb24", } From 595ef992892abc78c08df60d2c92fad566ab9857 Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 14:00:21 -0800 Subject: [PATCH 12/15] Add v0.2.0 checksums for all NIF versions (2.15, 2.16, 2.17) Generated from release binaries after CI build completed. DO NOT move the v0.2.0 tag - that would trigger a rebuild. 
Co-Authored-By: Claude Opus 4.5 --- checksum-Elixir.Excoding.Native.exs | 60 ++++++++++++++--------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/checksum-Elixir.Excoding.Native.exs b/checksum-Elixir.Excoding.Native.exs index c302bee..35d91c9 100644 --- a/checksum-Elixir.Excoding.Native.exs +++ b/checksum-Elixir.Excoding.Native.exs @@ -1,32 +1,32 @@ %{ - "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:cf07d357e8aabb0f88c846b89ce777cdd7cbdb01983e2b59366ba79a2069c0de", - "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:c8ae1a2cc864e89de662bad309ae6b2d50f4341a4ba37a02f7e9285f2aea6e77", - "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:62ceddbc40466a8ab3fc45d9679194515a0f39e09adbd2757d133c2f136498f3", - "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:71196998fbaad1d3e0adb3305292093291ba03620390ad475618f3fb30890aba", - "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:8c5388d77467e0ac741d0323508aa23a978d675f91ab3b6cb868a371b3b2ab28", - "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:9c58457b78a20928b51de4531fe9626ba0b3a9af0ecab150bf1ba496be5469a1", - "libexcoding-v0.2.0-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:0d6bac520e5e991d57bfe3e5ff4ac683334ac77abfe16a2016b39be58d15e9de", - "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:cd606cfc82f2d046125327c74172ced93e629fb828b19fa4c547e5c2a9912ff4", - "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:5902e24f375584f78b1c5c97d36f5e326f9c1ecebb36ddf0afd154369005e528", - "libexcoding-v0.2.0-nif-2.15-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:9efb6f33e99d37286c187adffb89044fb5ed7b5e6de49e5224895bd310022184", - "libexcoding-v0.2.0-nif-2.15-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:514407f6d82bf34341444562c84dd49d0dab564e28448344614a6b2b45f1634e", - 
"libexcoding-v0.2.0-nif-2.15-x86_64-apple-darwin.so.tar.gz" => "sha256:9c0ec78635cc220f21f85f5b87896b33b60684b3a068e5da162892200a4272bf", - "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:58a3f40a71f5e724f29eca5c24e3dd86fb0430b38bd5550251ce8e2ec1c1a749", - "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:3c33b07b1c423fddd53906c2cb307bd8908d3a55355c6d687d7d9dca3298dacd", - "libexcoding-v0.2.0-nif-2.16-aarch64-apple-darwin.so.tar.gz" => "sha256:3d9ab4fa32f1ea2289344d4d37eea3f1a4b74b00c5bb61673a109b17a885021d", - "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:b1cf49519760283a9b9a7705928f1715af77825cde5cc3de12fddb33f9270c21", - "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:ac3457a1c21f8374496b8ca42625af3b326fb70c02bb27183e21c26ab9ad9f54", - "libexcoding-v0.2.0-nif-2.16-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:92b5dbdf8322d3f00c49983b489405ee898d7b27e7fa00ebf13e30baf7e79287", - "libexcoding-v0.2.0-nif-2.16-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:f137a900ff3fc9ec52e54ee6cd5c41d5945da70c833344c7672ab0b144ab035e", - "libexcoding-v0.2.0-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:5ef49bc7673de7dd1bbcdbe78f3833ee0596d7ef2adbcacf566316f44993747c", - "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:6c3502d2ec5b9c4908bbbc56a7337a27b7f62889b2369d078779f22c694922e7", - "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:8fb1ae5e9cb012dbce98d8d323e31676ebd7ee127df2e948e4c713be4c25c8b7", - "libexcoding-v0.2.0-nif-2.17-aarch64-apple-darwin.so.tar.gz" => "sha256:dbb24f9f478a90dac15f2437ec02a1daceb4fad9a519d637a5f67feaaf5c4a97", - "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:b54e03e9c70c8e955e8fb55b9ede2b92f3751f2b1bfb43556255812d6888f2eb", - "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-musl.so.tar.gz" => 
"sha256:988e6546c5405be48f590a0202e41f56024e401fa762f4ad5e8b171ede7dfa85", - "libexcoding-v0.2.0-nif-2.17-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:944e3204203c1132cf78cd58c9472fa7102f1f9e38ded191184cb3cf7ca86412", - "libexcoding-v0.2.0-nif-2.17-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:88ff8d61b558609d217a214de37c3bba717f1161808e69fa44d6439aa00668a2", - "libexcoding-v0.2.0-nif-2.17-x86_64-apple-darwin.so.tar.gz" => "sha256:df39551fa82bb58b512d37f6a9ed3c045a0f9a1c12423a5005c2f25a4692e8fb", - "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:863cbfb3de4fa5fe14f62f4f1b9eefa257974b3c3eec8f0d85cc979b3f8f6a36", - "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:573d6ce23e9f7f12ebddcc67517f2a3e0f2cf3b12208c419124b43fc44bfcb24", + "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:09c536200a0d497a9e20c4dcfe04b1f6228d52cba78504d09590d8895c709d7e", + "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:f9f7e0b4048b9d58422fa11d7b1b1b447b30d7ad0d65961b7fdb07c1da065ae8", + "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:33cfd07844bf59875f10d98ab07a0294e9ffba491076757772e1a5d6493fd48c", + "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:45537883066c0bf458e6a72bc6c3fd5fe2f6ca03b37428bc4847d44f1b17b033", + "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:6a70afb4a0a0562216d9bd3d2ce8ec61fef7b8410ae9635affc609c5b2f7659f", + "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:53e4bb1368bab589bcd248988e949bb882f14d5735c0735cdf2cba0f20edbb6f", + "libexcoding-v0.2.0-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:d6c8ea8392f729de0df3d530fc6ab7f87b2425ea9eb813acb8f74db3cff34790", + "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:be8b078495bf599ed9b59c40bf803d040d2af524f22dea3ebae7b7fea1f11fc6", + 
"libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:46abd9dfbd1cb258cd9ce47b6ed314bab977e8576f8ed1b4b1201be474d63314", + "libexcoding-v0.2.0-nif-2.15-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:93fd3d5f5eef4eeded1fd8cb7ad7ba7531e159eb77dbc849b387ef8053aeb7e6", + "libexcoding-v0.2.0-nif-2.15-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:fa78586fedb9c33ed662cdd596a9ea24942ade87e5f5fd681588ac2aa921b657", + "libexcoding-v0.2.0-nif-2.15-x86_64-apple-darwin.so.tar.gz" => "sha256:e8230040d453ad3eecc8fe2f626310632d540bdcfcc5e252798a705caff92226", + "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:4449dd9ddf3ed80d1934d2726b415f2a8dc653fcc26a1d9a0ff8ee92a8b61e58", + "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:7f793b37dd904d808a7fec5372993575015a59dcf4b182ccbe51d3abb5ab7fdc", + "libexcoding-v0.2.0-nif-2.16-aarch64-apple-darwin.so.tar.gz" => "sha256:e82bb2ab04394ed404d4c8e0ce7acbb10f8b6746ade644d94635ba3751d50f79", + "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:9f069ef03b190ba70b9c30e0ec5059aacd77c46c8699ddc2d715a7ef743f18d1", + "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:3d486ad0f6bd446ad12260df6da45a9a2ece103a084d38872f5a131b905a84ef", + "libexcoding-v0.2.0-nif-2.16-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:ded6211f9b10a23b0365e08a27fd24ab9512ec46efb43c552983d3d7198d6c1d", + "libexcoding-v0.2.0-nif-2.16-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:0740978414a2ad4244d602be98759f9a18ad443aeaad5ddd5f4bc9ce385beea2", + "libexcoding-v0.2.0-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:95b8c2a48f85005f89aa6ed78f98c856ce7772a848504e27ab3e7f43d64969a0", + "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:1a230fb9aad95b3f680d868534b0af3f8e273c67481ee49799e484496eedf7b2", + "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => 
"sha256:848e9e77ff85deba9f6634abed00cf730b246c2a3b875189e418c0243f20b733", + "libexcoding-v0.2.0-nif-2.17-aarch64-apple-darwin.so.tar.gz" => "sha256:d974a7356de7a832a1060618d5b6b67401c7e700630cb91e3b4b76e607b48ab5", + "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:7efbf6a260f6545ecccd59a8f4e05e61f85fb92c8768a9d356b934ff3c5580f5", + "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:34333f891d86b8868fb6b74c354e1b9d4ed05a2cec4b31f6eea286cc4be4254f", + "libexcoding-v0.2.0-nif-2.17-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:7714b79b3ee304317476dabc2a921dec8c5603ecb72115382091594c814ad69d", + "libexcoding-v0.2.0-nif-2.17-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:0e0f89fab8290ef8851e6f80793e06ef5735f94a4812d5ddf4ac58c23cabcc9a", + "libexcoding-v0.2.0-nif-2.17-x86_64-apple-darwin.so.tar.gz" => "sha256:0e8f099448b8e7cdd983648edb29726679acbf7ce94d994a06cd41c90299b7a1", + "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:10d8eacf8e5dd8cc3d4d967c06e38634d5408e64399014ffd6d100c6b2759182", + "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:ad5db21b33f3f065e03f05c520b95fc6ca0e2dae05cadc026b14f2c675480f83", } From 55092eaeb9bfdc606241820b1ab166423e892e86 Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 16:24:26 -0800 Subject: [PATCH 13/15] Add streaming decoder and BOM detection (v0.2.1) Streaming Decoder (Excoding.Decoder): - Stateful decoder for chunked/streaming data - Properly handles multibyte characters split across chunk boundaries - Essential for File.stream!/3 with Shift_JIS, GBK, Big5, etc. 
- New functions: new/1, decode_chunk/3, stream/2, stream_with_errors/2 BOM Detection: - detect_bom/1 - Detect encoding from Byte Order Mark - detect_and_strip_bom/1 - Detect and strip BOM in one step - Supports UTF-8, UTF-16LE, UTF-16BE Co-Authored-By: Claude Opus 4.5 --- CHANGELOG.md | 13 ++ README.md | 92 ++++++++--- lib/excoding.ex | 74 +++++++++ lib/excoding/decoder.ex | 274 +++++++++++++++++++++++++++++++ lib/excoding/native.ex | 8 + mix.exs | 5 +- mix.lock | 3 + native/excoding/Cargo.lock | 2 +- native/excoding/Cargo.toml | 2 +- native/excoding/src/lib.rs | 150 ++++++++++++++++- test/excoding/decoder_test.exs | 289 +++++++++++++++++++++++++++++++++ test/excoding_test.exs | 91 +++++++++++ 12 files changed, 973 insertions(+), 30 deletions(-) create mode 100644 lib/excoding/decoder.ex create mode 100644 test/excoding/decoder_test.exs diff --git a/CHANGELOG.md b/CHANGELOG.md index 31054ac..00834e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## v0.2.1 (2026-01-22) + +### New Features + +- **Streaming decoder** (`Excoding.Decoder`): Stateful decoder for chunked/streaming data that properly handles multibyte characters split across chunk boundaries. Essential for processing file streams or network data in encodings like Shift_JIS, GBK, Big5, etc.
+ - `Excoding.Decoder.new/1` - Create a stateful decoder + - `Excoding.Decoder.decode_chunk/3` - Decode a chunk with state preservation + - `Excoding.Decoder.stream/2` - Stream transformer for use with `File.stream!/3` + - `Excoding.Decoder.stream_with_errors/2` - Stream transformer with error tracking +- **BOM detection**: Detect encoding from Byte Order Marks (UTF-8, UTF-16LE, UTF-16BE) + - `detect_bom/1` - Detect BOM and return encoding name and BOM length + - `detect_and_strip_bom/1` - Detect BOM, strip it, and return encoding with remaining data + ## v0.2.0 (2025-01-22) ### Breaking Changes diff --git a/README.md b/README.md index ed77d03..5e9ef28 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,88 @@ # Excoding -String encoding/decoding NIF using rust [encoding](https://crates.io/crates/encoding) library. +High-performance string encoding/decoding using Rust's [encoding_rs](https://crates.io/crates/encoding_rs) library (the same encoding library used by Firefox). -## Installation +Supports all encodings in the [WHATWG Encoding Standard](https://encoding.spec.whatwg.org/): +- UTF-8, UTF-16LE, UTF-16BE +- Windows code pages (874, 1250-1258) +- ISO-8859 family (1-16) +- Asian encodings: Shift_JIS, EUC-JP, ISO-2022-JP, EUC-KR, GBK, GB18030, Big5 +- And more -The package can be installed by adding `excoding` to your list of dependencies in `mix.exs`: +## Installation ```elixir def deps do [ - {:excoding, "~> 0.1"} + {:excoding, "~> 0.2"} ] end ``` -It requires rustc and cargo to compile. Easiest way to install required tools is -to use excellent [rustup](https://rustup.rs/) script. +Precompiled binaries are available for common platforms. If a precompiled binary isn't available for your platform, you'll need Rust installed (use [rustup](https://rustup.rs/)). 
-## License +## Usage + +### One-Shot Encoding/Decoding + +For complete binaries where all data is available at once: + +```elixir +# Decode from Shift_JIS to UTF-8 +{:ok, string} = Excoding.decode(binary, "shift_jis") +string = Excoding.decode!(binary, "shift_jis") + +# Encode from UTF-8 to Windows-1252 +{:ok, binary} = Excoding.encode(string, "windows-1252") +binary = Excoding.encode!(string, "windows-1252") + +# Check if encoding is supported +Excoding.encoding_exists?("utf-8") # true + +# Get canonical name for an alias +Excoding.canonical_name("latin1") # {:ok, "windows-1252"} +``` -The MIT License (MIT) +### Streaming Decoding -Copyright (c) 2020, Milan Jaric. +For chunked data (file streams, network data), use `Excoding.Decoder` to properly handle multibyte characters that may be split across chunk boundaries: -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +```elixir +# Stream a Shift_JIS file to UTF-8 +File.stream!("data.txt", [], 4096) +|> Excoding.Decoder.stream("shift_jis") +|> Enum.join() + +# Manual chunked decoding +{:ok, decoder} = Excoding.Decoder.new("shift_jis") +{:ok, out1, _errors} = Excoding.Decoder.decode_chunk(decoder, chunk1, false) +{:ok, out2, _errors} = Excoding.Decoder.decode_chunk(decoder, chunk2, false) +{:ok, out3, _errors} = Excoding.Decoder.decode_chunk(decoder, final_chunk, true) +result = out1 <> out2 <> out3 +``` -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. +**Why streaming matters**: Multibyte encodings like Shift_JIS use 2+ bytes per character. 
If a chunk boundary splits a character, the one-shot `decode/2` would see invalid bytes and produce replacement characters (`�`). The streaming decoder buffers incomplete sequences until the next chunk completes them. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. +### BOM Detection + +Detect encoding from a Byte Order Mark (BOM) at the start of a file: + +```elixir +# Detect BOM and get encoding +{:ok, "UTF-8", 3} = Excoding.detect_bom(<<0xEF, 0xBB, 0xBF, "hello">>) +{:ok, "UTF-16LE", 2} = Excoding.detect_bom(<<0xFF, 0xFE, ...>>) +{:ok, "UTF-16BE", 2} = Excoding.detect_bom(<<0xFE, 0xFF, ...>>) +{:error, :no_bom} = Excoding.detect_bom("no bom here") +# Detect and strip BOM in one step +{:ok, encoding, data_without_bom} = Excoding.detect_and_strip_bom(file_content) +{:ok, decoded} = Excoding.decode(data_without_bom, encoding) +``` + +## Dirty Schedulers + +Operations on binaries larger than 64KB automatically use dirty CPU schedulers to avoid blocking the BEAM. + +## License +MIT License - see LICENSE file for details. diff --git a/lib/excoding.ex b/lib/excoding.ex index 721f496..6e7f7d5 100644 --- a/lib/excoding.ex +++ b/lib/excoding.ex @@ -227,4 +227,78 @@ defmodule Excoding do def dirty_threshold do Native.dirty_threshold() end + + @doc """ + Detects the encoding from a Byte Order Mark (BOM) at the start of the data. + + BOMs are special byte sequences at the beginning of a file that indicate + the encoding. This function checks the first few bytes of the input and + returns the detected encoding if a BOM is found. 
+ + Supported BOMs: + - UTF-8: `<<0xEF, 0xBB, 0xBF>>` (3 bytes) + - UTF-16LE: `<<0xFF, 0xFE>>` (2 bytes) + - UTF-16BE: `<<0xFE, 0xFF>>` (2 bytes) + + ## Returns + + - `{:ok, encoding, bom_length}` - BOM detected, returns encoding name and BOM size + - `{:error, :no_bom}` - No BOM found at the start of the data + + ## Examples + + iex> Excoding.detect_bom(<<0xEF, 0xBB, 0xBF, "hello">>) + {:ok, "UTF-8", 3} + + iex> Excoding.detect_bom(<<0xFF, 0xFE, 0x48, 0x00>>) + {:ok, "UTF-16LE", 2} + + iex> Excoding.detect_bom(<<0xFE, 0xFF, 0x00, 0x48>>) + {:ok, "UTF-16BE", 2} + + iex> Excoding.detect_bom("hello") + {:error, :no_bom} + + iex> Excoding.detect_bom(<<>>) + {:error, :no_bom} + """ + @spec detect_bom(binary()) :: {:ok, String.t(), non_neg_integer()} | {:error, :no_bom} + def detect_bom(data) when is_binary(data) do + case Native.detect_bom(data) do + {:ok, encoding, bom_length} -> {:ok, encoding, bom_length} + {:error, _, _} -> {:error, :no_bom} + end + end + + @doc """ + Detects the encoding from a BOM and strips it from the data. + + Convenience function that combines BOM detection with stripping the BOM + from the input data. Useful when you want to both detect the encoding + and get the data without the BOM prefix. 
+ + ## Returns + + - `{:ok, encoding, data_without_bom}` - BOM detected and stripped + - `{:error, :no_bom}` - No BOM found, data unchanged + + ## Examples + + iex> Excoding.detect_and_strip_bom(<<0xEF, 0xBB, 0xBF, "hello">>) + {:ok, "UTF-8", "hello"} + + iex> Excoding.detect_and_strip_bom("hello") + {:error, :no_bom} + """ + @spec detect_and_strip_bom(binary()) :: {:ok, String.t(), binary()} | {:error, :no_bom} + def detect_and_strip_bom(data) when is_binary(data) do + case detect_bom(data) do + {:ok, encoding, bom_length} -> + <<_bom::binary-size(bom_length), rest::binary>> = data + {:ok, encoding, rest} + + {:error, :no_bom} -> + {:error, :no_bom} + end + end end diff --git a/lib/excoding/decoder.ex b/lib/excoding/decoder.ex new file mode 100644 index 0000000..2cd04e3 --- /dev/null +++ b/lib/excoding/decoder.ex @@ -0,0 +1,274 @@ +defmodule Excoding.Decoder do + @moduledoc """ + Stateful streaming decoder for converting encoded byte streams to UTF-8. + + This module provides a streaming API for decoding multibyte encodings + (like Shift_JIS, GBK, Big5, EUC-JP, etc.) where characters may be split + across chunk boundaries. + + ## Why Use Streaming Decoding? + + Multibyte encodings use variable-length byte sequences to represent characters. + For example, in Shift_JIS, the character "あ" is encoded as two bytes: `<<0x82, 0xA0>>`. + + When processing data in chunks (e.g., from `File.stream!/1` or network streams), + a character's bytes may be split across chunks: + + # Chunk 1 ends with first byte of "あ" + chunk1 = <<..., 0x82>> + # Chunk 2 starts with second byte of "あ" + chunk2 = <<0xA0, ...>> + + The one-shot `Excoding.decode/2` treats each chunk independently, so: + - Chunk 1's trailing `0x82` is invalid → replaced with `�` + - Chunk 2's leading `0xA0` is invalid → replaced with `�` + + The streaming decoder maintains state between chunks, properly buffering + incomplete sequences until completed. 
+ + ## Usage + + ### Manual Chunked Decoding + + {:ok, decoder} = Excoding.Decoder.new("shift_jis") + + {:ok, output1, _} = Excoding.Decoder.decode_chunk(decoder, chunk1, false) + {:ok, output2, _} = Excoding.Decoder.decode_chunk(decoder, chunk2, false) + {:ok, output3, _} = Excoding.Decoder.decode_chunk(decoder, chunk3, true) + + result = output1 <> output2 <> output3 + + ### Stream-Based Decoding + + File.stream!("data.txt", [], 4096) + |> Excoding.Decoder.stream("shift_jis") + |> Enum.join() + + ## Important Notes + + - Always pass `is_last: true` for the final chunk to flush any buffered bytes + - The decoder resource is mutable; don't share it across concurrent processes + - For single complete binaries, use `Excoding.decode/2` instead (more efficient) + """ + + alias Excoding.Native + + @type t :: reference() + @type decode_result :: {:ok, String.t(), had_errors :: boolean()} + + @doc """ + Creates a new stateful decoder for the specified encoding. + + The decoder maintains internal state to handle multibyte characters + that may be split across chunk boundaries. + + ## Arguments + + - `encoding` - The source encoding label (e.g., "shift_jis", "gbk", "euc-jp") + + ## Returns + + - `{:ok, decoder}` on success + - `{:error, :unknown_encoding}` if the encoding is not recognized + + ## Examples + + iex> {:ok, decoder} = Excoding.Decoder.new("shift_jis") + iex> is_reference(decoder) + true + + iex> Excoding.Decoder.new("invalid-encoding") + {:error, :unknown_encoding} + """ + @spec new(String.t()) :: {:ok, t()} | {:error, :unknown_encoding} + def new(encoding) when is_binary(encoding) do + case Native.decoder_new(encoding) do + {:ok, decoder} when is_reference(decoder) -> {:ok, decoder} + {:error, _} -> {:error, :unknown_encoding} + end + end + + @doc """ + Creates a new stateful decoder, raising on error. 
+ + ## Examples + + iex> decoder = Excoding.Decoder.new!("shift_jis") + iex> is_reference(decoder) + true + + iex> Excoding.Decoder.new!("invalid-encoding") + ** (ArgumentError) unknown encoding: invalid-encoding + """ + @spec new!(String.t()) :: t() + def new!(encoding) when is_binary(encoding) do + case new(encoding) do + {:ok, decoder} -> decoder + {:error, :unknown_encoding} -> raise ArgumentError, "unknown encoding: #{encoding}" + end + end + + @doc """ + Decodes a chunk of bytes using the stateful decoder. + + This function properly handles multibyte characters split across chunk + boundaries by maintaining decoder state between calls. + + ## Arguments + + - `decoder` - The decoder reference from `new/1` + - `chunk` - The binary chunk to decode + - `is_last` - Set to `true` for the final chunk (default: `false`) + + ## Returns + + - `{:ok, output, had_errors}` on success + - `output` - The decoded UTF-8 string for this chunk + - `had_errors` - `true` if any bytes were replaced with U+FFFD + + ## Behavior + + - When `is_last` is `false`: Incomplete byte sequences at the end of the + chunk are buffered internally and completed with the next chunk. + - When `is_last` is `true`: Any remaining incomplete sequences are replaced + with U+FFFD (the Unicode replacement character). 
+ + ## Examples + + iex> {:ok, decoder} = Excoding.Decoder.new("shift_jis") + iex> # Shift_JIS "あ" is <<0x82, 0xA0>> - split across chunks + iex> {:ok, out1, false} = Excoding.Decoder.decode_chunk(decoder, <<0x82>>, false) + iex> {:ok, out2, false} = Excoding.Decoder.decode_chunk(decoder, <<0xA0>>, true) + iex> out1 <> out2 + "あ" + """ + @spec decode_chunk(t(), binary(), boolean()) :: decode_result() + def decode_chunk(decoder, chunk, is_last \\ false) + when is_reference(decoder) and is_binary(chunk) and is_boolean(is_last) do + if byte_size(chunk) > Native.dirty_threshold() do + Native.decoder_decode_chunk_dirty(decoder, chunk, is_last) + else + Native.decoder_decode_chunk(decoder, chunk, is_last) + end + end + + @doc """ + Decodes a chunk, raising on error. + + See `decode_chunk/3` for details. + + ## Examples + + iex> decoder = Excoding.Decoder.new!("utf-8") + iex> Excoding.Decoder.decode_chunk!(decoder, "hello", true) + {"hello", false} + """ + @spec decode_chunk!(t(), binary(), boolean()) :: {String.t(), boolean()} + def decode_chunk!(decoder, chunk, is_last \\ false) + when is_reference(decoder) and is_binary(chunk) and is_boolean(is_last) do + case decode_chunk(decoder, chunk, is_last) do + {:ok, output, had_errors} -> {output, had_errors} + end + end + + @doc """ + Creates a stream that decodes chunks from the given encoding to UTF-8. + + This is the recommended way to process streaming data in multibyte encodings. + It properly handles characters split across chunk boundaries. + + ## Arguments + + - `chunks` - An enumerable of binary chunks (e.g., from `File.stream!/3`) + - `encoding` - The source encoding label + + ## Returns + + A stream of decoded UTF-8 strings, one for each input chunk. 
+ + ## Examples + + # Decode a Shift_JIS file + File.stream!("japanese.txt", [], 4096) + |> Excoding.Decoder.stream("shift_jis") + |> Enum.join() + + # Process line by line (after decoding) + File.stream!("data.csv", [], 8192) + |> Excoding.Decoder.stream("gbk") + |> Enum.join() + |> String.split("\\n") + + # With error tracking + File.stream!("data.txt", [], 4096) + |> Excoding.Decoder.stream_with_errors("windows-1252") + |> Enum.reduce({"", false}, fn {chunk, errors}, {acc, had_any} -> + {acc <> chunk, had_any or errors} + end) + + ## Notes + + - The stream automatically handles the `is_last` flag for the final chunk + - Each input chunk yields one output element; one extra trailing element is emitted if the final flush produces output + - For better error visibility, use `stream_with_errors/2` + """ + @spec stream(Enumerable.t(), String.t()) :: Enumerable.t() + def stream(chunks, encoding) when is_binary(encoding) do + Stream.transform( + chunks, + fn -> new!(encoding) end, + fn chunk, decoder -> + {output, _had_errors} = decode_chunk!(decoder, chunk, false) + {[output], decoder} + end, + fn decoder -> + # Flush any remaining buffered bytes + {output, _had_errors} = decode_chunk!(decoder, <<>>, true) + + if output == "" do + {[], decoder} + else + {[output], decoder} + end + end, + fn _decoder -> :ok end + ) + end + + @doc """ + Creates a stream that decodes chunks, including error information. + + Like `stream/2`, but each element is a tuple `{decoded_string, had_errors}`.
+ + ## Examples + + File.stream!("data.txt", [], 4096) + |> Excoding.Decoder.stream_with_errors("shift_jis") + |> Enum.each(fn {chunk, had_errors} -> + if had_errors, do: Logger.warning("Encountered invalid bytes") + IO.write(chunk) + end) + """ + @spec stream_with_errors(Enumerable.t(), String.t()) :: Enumerable.t() + def stream_with_errors(chunks, encoding) when is_binary(encoding) do + Stream.transform( + chunks, + fn -> new!(encoding) end, + fn chunk, decoder -> + result = decode_chunk!(decoder, chunk, false) + {[result], decoder} + end, + fn decoder -> + result = decode_chunk!(decoder, <<>>, true) + {output, _had_errors} = result + + if output == "" do + {[], decoder} + else + {[result], decoder} + end + end, + fn _decoder -> :ok end + ) + end +end diff --git a/lib/excoding/native.ex b/lib/excoding/native.ex index 5ca58b0..6b2d897 100644 --- a/lib/excoding/native.ex +++ b/lib/excoding/native.ex @@ -27,4 +27,12 @@ defmodule Excoding.Native do def encoding_exists(_encoding), do: :erlang.nif_error(:nif_not_loaded) def canonical_name(_encoding), do: :erlang.nif_error(:nif_not_loaded) def list_encodings, do: :erlang.nif_error(:nif_not_loaded) + def detect_bom(_data), do: :erlang.nif_error(:nif_not_loaded) + + # Streaming decoder functions + def decoder_new(_encoding), do: :erlang.nif_error(:nif_not_loaded) + def decoder_decode_chunk(_decoder, _chunk, _is_last), do: :erlang.nif_error(:nif_not_loaded) + + def decoder_decode_chunk_dirty(_decoder, _chunk, _is_last), + do: :erlang.nif_error(:nif_not_loaded) end diff --git a/mix.exs b/mix.exs index fa62af9..266a9ee 100644 --- a/mix.exs +++ b/mix.exs @@ -1,7 +1,7 @@ defmodule Excoding.MixProject do use Mix.Project - @version "0.2.0" + @version "0.2.1" def project do [ @@ -27,7 +27,8 @@ defmodule Excoding.MixProject do [ {:rustler_precompiled, "~> 0.8"}, {:rustler, "~> 0.37", optional: true}, - {:ex_doc, "~> 0.31", only: :dev, runtime: false} + {:ex_doc, "~> 0.31", only: :dev, runtime: false}, + {:credo, "~> 1.7", only: 
[:dev, :test], runtime: false} ] end diff --git a/mix.lock b/mix.lock index 8394429..dd351c7 100644 --- a/mix.lock +++ b/mix.lock @@ -1,7 +1,10 @@ %{ + "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"}, "castore": {:hex, :castore, "1.0.17", "4f9770d2d45fbd91dcf6bd404cf64e7e58fed04fadda0923dc32acca0badffa2", [:mix], [], "hexpm", "12d24b9d80b910dd3953e165636d68f147a31db945d2dcb9365e441f8b5351e5"}, + "credo": {:hex, :credo, "1.7.15", "283da72eeb2fd3ccf7248f4941a0527efb97afa224bcdef30b4b580bc8258e1c", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "291e8645ea3fea7481829f1e1eb0881b8395db212821338e577a90bf225c5607"}, "earmark_parser": {:hex, :earmark_parser, "1.4.44", "f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"}, "ex_doc": {:hex, :ex_doc, "0.40.0", "2635974389b80fd3ca61b0f993d459dad05b4a8f9b069dcfbbc5f6a8a6aef60e", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "c040735250e2752b6e1102eeb4aa3f1dca74c316db873ae09f955d42136e7e5b"}, + "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"}, 
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, diff --git a/native/excoding/Cargo.lock b/native/excoding/Cargo.lock index 1dd2f17..3f38309 100644 --- a/native/excoding/Cargo.lock +++ b/native/excoding/Cargo.lock @@ -19,7 +19,7 @@ dependencies = [ [[package]] name = "excoding" -version = "0.2.0" +version = "0.2.1" dependencies = [ "encoding_rs", "rustler", diff --git a/native/excoding/Cargo.toml b/native/excoding/Cargo.toml index 3afea08..a2b3ccb 100644 --- a/native/excoding/Cargo.toml +++ b/native/excoding/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "excoding" -version = "0.2.0" +version = "0.2.1" authors = [] edition = "2021" diff --git a/native/excoding/src/lib.rs b/native/excoding/src/lib.rs index 7ddea93..93bf4a3 100644 --- a/native/excoding/src/lib.rs +++ b/native/excoding/src/lib.rs @@ -10,10 +10,17 @@ //! - ISO-8859 family (1-16) //! - Asian encodings (Shift_JIS, EUC-JP, EUC-KR, GBK, GB18030, Big5) //! - And many more from the WHATWG Encoding Standard +//! +//! ## Streaming Support +//! +//! For streaming/chunked decoding of multibyte encodings, use the stateful +//! 
decoder API (`decoder_new`, `decoder_decode_chunk`) which properly handles +//! characters split across chunk boundaries. use encoding_rs::Encoding; -use rustler::{Atom, Binary, Env, NifResult, OwnedBinary}; +use rustler::{Atom, Binary, Env, NifResult, OwnedBinary, ResourceArc}; use std::io::Write; +use std::sync::Mutex; mod atoms { rustler::atoms! { @@ -22,9 +29,19 @@ mod atoms { unknown_encoding, encode_error, decode_error, + no_bom, } } +/// Stateful decoder resource for streaming decoding. +/// +/// Wraps an `encoding_rs::Decoder` in a Mutex for safe concurrent access +/// from the BEAM. The decoder maintains internal state for incomplete +/// multibyte sequences between chunk boundaries. +pub struct DecoderResource { + decoder: Mutex<encoding_rs::Decoder>, +} + /// Threshold for using dirty schedulers (64KB) const DIRTY_THRESHOLD: usize = 64 * 1024; @@ -194,4 +211,133 @@ fn list_encodings() -> Vec<&'static str> { ] } -rustler::init!("Elixir.Excoding.Native"); +// ============================================================================= +// BOM Detection +// ============================================================================= + +/// Detects the encoding from a Byte Order Mark (BOM) at the start of the data.
+/// +/// BOMs are special byte sequences at the beginning of a file that indicate +/// the encoding: +/// - UTF-8: EF BB BF (3 bytes) +/// - UTF-16LE: FF FE (2 bytes) +/// - UTF-16BE: FE FF (2 bytes) +/// +/// ## Arguments +/// * `data` - The binary data to check (only first 3 bytes are examined) +/// +/// ## Returns +/// * `{:ok, encoding_name, bom_length}` if a BOM is found +/// * `{:error, "no_bom", 0}` if no BOM is present (the Elixir wrapper maps this to `{:error, :no_bom}`) +#[rustler::nif] +fn detect_bom(data: Binary) -> (Atom, String, usize) { + match Encoding::for_bom(data.as_slice()) { + Some((encoding, bom_length)) => (atoms::ok(), encoding.name().to_string(), bom_length), + None => (atoms::error(), "no_bom".to_string(), 0), + } +} + +// ============================================================================= +// Streaming Decoder API +// ============================================================================= + +/// Creates a new stateful decoder for the specified encoding. +/// +/// The decoder maintains internal state for handling multibyte characters +/// that may be split across chunk boundaries in streaming scenarios. +/// +/// ## Arguments +/// * `enc` - The source encoding label (WHATWG format, e.g., "shift_jis") +/// +/// ## Returns +/// * `{:ok, decoder_ref}` on success +/// * `{:error, nil}` if encoding label is not recognized (the Elixir wrapper maps this to `{:error, :unknown_encoding}`) +#[rustler::nif] +fn decoder_new(enc: &str) -> (Atom, Option<ResourceArc<DecoderResource>>) { + match Encoding::for_label(enc.as_bytes()) { + Some(encoding) => { + let decoder = encoding.new_decoder(); + let resource = ResourceArc::new(DecoderResource { + decoder: Mutex::new(decoder), + }); + (atoms::ok(), Some(resource)) + } + None => (atoms::error(), None), + } +} + +/// Decodes a chunk of bytes using the stateful decoder. +/// +/// This function properly handles multibyte characters split across chunk +/// boundaries by maintaining decoder state between calls.
+/// +/// ## Arguments +/// * `decoder_ref` - The decoder resource from `decoder_new` +/// * `chunk` - The binary chunk to decode +/// * `is_last` - Set to `true` for the final chunk to flush any remaining state +/// +/// ## Returns +/// * `{:ok, output_string, had_errors}` on success +/// - `output_string`: The decoded UTF-8 string for this chunk +/// - `had_errors`: `true` if any bytes were replaced with U+FFFD +/// +/// ## Notes +/// When `is_last` is `false`, incomplete byte sequences at the end of the chunk +/// are buffered internally and will be completed with the next chunk. +/// When `is_last` is `true`, any incomplete sequences are replaced with U+FFFD. +#[rustler::nif] +fn decoder_decode_chunk( + decoder_ref: ResourceArc<DecoderResource>, + chunk: Binary, + is_last: bool, +) -> NifResult<(Atom, String, bool)> { + let mut decoder = decoder_ref + .decoder + .lock() + .map_err(|_| rustler::Error::Term(Box::new("lock_poisoned")))?; + + let input = chunk.as_slice(); + + // Calculate maximum output size: worst case is 3 bytes per input byte for UTF-8 + // plus potential replacement characters + let max_output_len = decoder + .max_utf8_buffer_length(input.len()) + .unwrap_or(input.len() * 3 + 3); + + let mut output = String::with_capacity(max_output_len); + + let (_result, _read, had_errors) = decoder.decode_to_string(input, &mut output, is_last); + + Ok((atoms::ok(), output, had_errors)) +} + +/// Decodes a chunk using a dirty CPU scheduler for large chunks.
+#[rustler::nif(schedule = "DirtyCpu")] +fn decoder_decode_chunk_dirty( + decoder_ref: ResourceArc<DecoderResource>, + chunk: Binary, + is_last: bool, +) -> NifResult<(Atom, String, bool)> { + let mut decoder = decoder_ref + .decoder + .lock() + .map_err(|_| rustler::Error::Term(Box::new("lock_poisoned")))?; + + let input = chunk.as_slice(); + let max_output_len = decoder + .max_utf8_buffer_length(input.len()) + .unwrap_or(input.len() * 3 + 3); + + let mut output = String::with_capacity(max_output_len); + let (_result, _read, had_errors) = decoder.decode_to_string(input, &mut output, is_last); + + Ok((atoms::ok(), output, had_errors)) +} + +#[allow(non_local_definitions)] +fn on_load(env: Env, _info: rustler::Term) -> bool { + let _ = rustler::resource!(DecoderResource, env); + true +} + +rustler::init!("Elixir.Excoding.Native", load = on_load); diff --git a/test/excoding/decoder_test.exs b/test/excoding/decoder_test.exs new file mode 100644 index 0000000..cd06437 --- /dev/null +++ b/test/excoding/decoder_test.exs @@ -0,0 +1,289 @@ +defmodule Excoding.DecoderTest do + use ExUnit.Case, async: true + doctest Excoding.Decoder + + alias Excoding.Decoder + + describe "new/1" do + test "creates decoder for valid encoding" do + assert {:ok, decoder} = Decoder.new("shift_jis") + assert is_reference(decoder) + end + + test "creates decoder for encoding aliases" do + assert {:ok, _} = Decoder.new("sjis") + assert {:ok, _} = Decoder.new("Shift_JIS") + assert {:ok, _} = Decoder.new("utf-8") + assert {:ok, _} = Decoder.new("UTF-8") + end + + test "returns error for unknown encoding" do + assert {:error, :unknown_encoding} = Decoder.new("not-an-encoding") + end + end + + describe "new!/1" do + test "creates decoder for valid encoding" do + decoder = Decoder.new!("gbk") + assert is_reference(decoder) + end + + test "raises for unknown encoding" do + assert_raise ArgumentError, ~r/unknown encoding/, fn -> + Decoder.new!("invalid") + end + end + end + + describe "decode_chunk/3" do + test "decodes
complete single-byte chunk" do + {:ok, decoder} = Decoder.new("utf-8") + assert {:ok, "hello", false} = Decoder.decode_chunk(decoder, "hello", true) + end + + test "decodes complete multibyte chunk" do + {:ok, decoder} = Decoder.new("shift_jis") + # "あ" in Shift_JIS is <<0x82, 0xA0>> + assert {:ok, "あ", false} = Decoder.decode_chunk(decoder, <<0x82, 0xA0>>, true) + end + + test "handles split multibyte character - the critical fix" do + {:ok, decoder} = Decoder.new("shift_jis") + + # "あ" (<<0x82, 0xA0>>) split across two chunks + # First chunk: incomplete character, should buffer it + {:ok, output1, false} = Decoder.decode_chunk(decoder, <<0x82>>, false) + # Second chunk: completes the character + {:ok, output2, false} = Decoder.decode_chunk(decoder, <<0xA0>>, true) + + assert output1 <> output2 == "あ" + end + + test "handles multiple split characters" do + {:ok, decoder} = Decoder.new("shift_jis") + + # "あい" = <<0x82, 0xA0, 0x82, 0xA2>> in Shift_JIS + # Split: <<0x82>> | <<0xA0, 0x82>> | <<0xA2>> + {:ok, out1, false} = Decoder.decode_chunk(decoder, <<0x82>>, false) + {:ok, out2, false} = Decoder.decode_chunk(decoder, <<0xA0, 0x82>>, false) + {:ok, out3, false} = Decoder.decode_chunk(decoder, <<0xA2>>, true) + + assert out1 <> out2 <> out3 == "あい" + end + + test "handles mixed ASCII and multibyte split" do + {:ok, decoder} = Decoder.new("shift_jis") + + # "Aあ" = <<0x41, 0x82, 0xA0>> in Shift_JIS + # Split after ASCII: <<0x41, 0x82>> | <<0xA0>> + {:ok, out1, false} = Decoder.decode_chunk(decoder, <<0x41, 0x82>>, false) + {:ok, out2, false} = Decoder.decode_chunk(decoder, <<0xA0>>, true) + + assert out1 <> out2 == "Aあ" + end + + test "reports errors for invalid bytes" do + {:ok, decoder} = Decoder.new("utf-8") + # 0xFF is invalid UTF-8 + {:ok, output, had_errors} = Decoder.decode_chunk(decoder, <<0xFF>>, true) + assert had_errors == true + assert output == "�" + end + + test "flushes incomplete sequence on is_last=true" do + {:ok, decoder} = Decoder.new("shift_jis") 
+ + # Incomplete character at end with is_last=true should produce replacement + {:ok, output, had_errors} = Decoder.decode_chunk(decoder, <<0x82>>, true) + assert had_errors == true + assert output == "�" + end + + test "handles empty chunk" do + {:ok, decoder} = Decoder.new("utf-8") + assert {:ok, "", false} = Decoder.decode_chunk(decoder, <<>>, false) + assert {:ok, "", false} = Decoder.decode_chunk(decoder, <<>>, true) + end + + test "GBK encoding with split character" do + {:ok, decoder} = Decoder.new("gbk") + + # "中" in GBK is <<0xD6, 0xD0>> + {:ok, out1, false} = Decoder.decode_chunk(decoder, <<0xD6>>, false) + {:ok, out2, false} = Decoder.decode_chunk(decoder, <<0xD0>>, true) + + assert out1 <> out2 == "中" + end + + test "Big5 encoding with split character" do + {:ok, decoder} = Decoder.new("big5") + + # "中" in Big5 is <<0xA4, 0xA4>> + {:ok, out1, false} = Decoder.decode_chunk(decoder, <<0xA4>>, false) + {:ok, out2, false} = Decoder.decode_chunk(decoder, <<0xA4>>, true) + + assert out1 <> out2 == "中" + end + + test "EUC-JP encoding with split character" do + {:ok, decoder} = Decoder.new("euc-jp") + + # "あ" in EUC-JP is <<0xA4, 0xA2>> + {:ok, out1, false} = Decoder.decode_chunk(decoder, <<0xA4>>, false) + {:ok, out2, false} = Decoder.decode_chunk(decoder, <<0xA2>>, true) + + assert out1 <> out2 == "あ" + end + end + + describe "decode_chunk!/3" do + test "returns tuple without :ok" do + decoder = Decoder.new!("utf-8") + assert {"hello", false} = Decoder.decode_chunk!(decoder, "hello", true) + end + end + + describe "stream/2" do + test "decodes stream of chunks" do + # "あいう" in Shift_JIS = <<0x82, 0xA0, 0x82, 0xA2, 0x82, 0xA4>> + chunks = [<<0x82, 0xA0>>, <<0x82, 0xA2>>, <<0x82, 0xA4>>] + + result = + chunks + |> Decoder.stream("shift_jis") + |> Enum.join() + + assert result == "あいう" + end + + test "handles split characters in stream" do + # "あ" split across chunks + chunks = [<<0x82>>, <<0xA0>>] + + result = + chunks + |> Decoder.stream("shift_jis") + |> 
Enum.join() + + assert result == "あ" + end + + test "handles complex split pattern" do + # "ABあいCD" in Shift_JIS + # Split in awkward places + chunks = [ + <<0x41, 0x42, 0x82>>, + <<0xA0, 0x82>>, + <<0xA2, 0x43, 0x44>> + ] + + result = + chunks + |> Decoder.stream("shift_jis") + |> Enum.join() + + assert result == "ABあいCD" + end + + test "handles empty chunks in stream" do + chunks = [<<>>, <<0x82, 0xA0>>, <<>>, <<0x82, 0xA2>>, <<>>] + + result = + chunks + |> Decoder.stream("shift_jis") + |> Enum.join() + + assert result == "あい" + end + + test "works with single-byte encodings" do + # ISO-8859-1 "café" + chunks = [<<99, 97>>, <<102, 233>>] + + result = + chunks + |> Decoder.stream("iso-8859-1") + |> Enum.join() + + assert result == "café" + end + + test "handles UTF-16LE with split surrogate" do + # UTF-16LE "A" is <<0x41, 0x00>> + # Split the single 2-byte code unit across chunks (this is not a surrogate pair) + chunks = [<<0x41>>, <<0x00>>] + + result = + chunks + |> Decoder.stream("utf-16le") + |> Enum.join() + + assert result == "A" + end + end + + describe "stream_with_errors/2" do + test "includes error information" do + # Mix of valid and invalid UTF-8 + chunks = [<<"hello">>, <<0xFF>>, <<"world">>] + + results = + chunks + |> Decoder.stream_with_errors("utf-8") + |> Enum.to_list() + + assert [{"hello", false}, {"�", true}, {"world", false} | _] = results + end + + test "tracks errors across split characters" do + chunks = [<<0x82>>, <<0xA0>>] + + results = + chunks + |> Decoder.stream_with_errors("shift_jis") + |> Enum.to_list() + + # First chunk outputs nothing (buffered), second outputs the character + # Neither should have errors since this is a valid split + outputs = Enum.map(results, fn {out, _} -> out end) + errors = Enum.map(results, fn {_, err} -> err end) + + assert Enum.join(outputs) == "あ" + assert Enum.all?(errors, &(&1 == false)) + end + end + + describe "comparison with one-shot decode (demonstrating the bug fix)" do + test "one-shot decode corrupts split multibyte characters" do + # This
demonstrates the bug that streaming fixes + chunks = [<<0x82>>, <<0xA0>>] + + # One-shot decode of each chunk independently (the bug) + one_shot_result = Enum.map_join(chunks, &Excoding.decode!(&1, "shift_jis")) + + # Streaming decode (the fix) + streaming_result = + chunks + |> Decoder.stream("shift_jis") + |> Enum.join() + + # One-shot produces replacement characters (corruption) + assert one_shot_result == "��" + + # Streaming produces the correct character + assert streaming_result == "あ" + end + + test "one-shot is fine for complete input" do + # When input is complete, both approaches work + complete_input = <<0x82, 0xA0>> + + one_shot = Excoding.decode!(complete_input, "shift_jis") + streaming = [complete_input] |> Decoder.stream("shift_jis") |> Enum.join() + + assert one_shot == "あ" + assert streaming == "あ" + assert one_shot == streaming + end + end +end diff --git a/test/excoding_test.exs b/test/excoding_test.exs index 8f313ff..9ea19f0 100644 --- a/test/excoding_test.exs +++ b/test/excoding_test.exs @@ -149,4 +149,95 @@ defmodule ExcodingTest do assert decoded_kr == original_kr end end + + describe "detect_bom/1" do + test "detects UTF-8 BOM" do + # UTF-8 BOM: EF BB BF + data = <<0xEF, 0xBB, 0xBF, "hello world">> + assert {:ok, "UTF-8", 3} = Excoding.detect_bom(data) + end + + test "detects UTF-16LE BOM" do + # UTF-16LE BOM: FF FE + data = <<0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00>> + assert {:ok, "UTF-16LE", 2} = Excoding.detect_bom(data) + end + + test "detects UTF-16BE BOM" do + # UTF-16BE BOM: FE FF + data = <<0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69>> + assert {:ok, "UTF-16BE", 2} = Excoding.detect_bom(data) + end + + test "returns error when no BOM present" do + assert {:error, :no_bom} = Excoding.detect_bom("hello world") + assert {:error, :no_bom} = Excoding.detect_bom(<<0x48, 0x65, 0x6C, 0x6C, 0x6F>>) + end + + test "returns error for empty binary" do + assert {:error, :no_bom} = Excoding.detect_bom(<<>>) + end + + test "returns error for partial BOM" do + 
# Only first byte of UTF-8 BOM + assert {:error, :no_bom} = Excoding.detect_bom(<<0xEF>>) + # Only first two bytes of UTF-8 BOM + assert {:error, :no_bom} = Excoding.detect_bom(<<0xEF, 0xBB>>) + end + end + + describe "detect_and_strip_bom/1" do + test "detects and strips UTF-8 BOM" do + data = <<0xEF, 0xBB, 0xBF, "hello">> + assert {:ok, "UTF-8", "hello"} = Excoding.detect_and_strip_bom(data) + end + + test "detects and strips UTF-16LE BOM" do + # "Hi" in UTF-16LE with BOM + data = <<0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00>> + assert {:ok, "UTF-16LE", <<0x48, 0x00, 0x69, 0x00>>} = Excoding.detect_and_strip_bom(data) + end + + test "detects and strips UTF-16BE BOM" do + # "Hi" in UTF-16BE with BOM + data = <<0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69>> + assert {:ok, "UTF-16BE", <<0x00, 0x48, 0x00, 0x69>>} = Excoding.detect_and_strip_bom(data) + end + + test "returns error when no BOM present" do + assert {:error, :no_bom} = Excoding.detect_and_strip_bom("hello") + end + + test "works with BOM-only data" do + assert {:ok, "UTF-8", ""} = Excoding.detect_and_strip_bom(<<0xEF, 0xBB, 0xBF>>) + end + end + + describe "BOM detection integration" do + test "detect BOM and decode file content" do + # Simulate a UTF-8 file with BOM + content = "こんにちは" + file_data = <<0xEF, 0xBB, 0xBF>> <> content + + # Detect encoding and strip BOM + {:ok, encoding, data_without_bom} = Excoding.detect_and_strip_bom(file_data) + + # Decode using detected encoding + {:ok, decoded} = Excoding.decode(data_without_bom, encoding) + + assert encoding == "UTF-8" + assert decoded == content + end + + test "detect BOM and decode UTF-16LE content" do + # "Hi" in UTF-16LE with BOM + file_data = <<0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00>> + + {:ok, encoding, data_without_bom} = Excoding.detect_and_strip_bom(file_data) + {:ok, decoded} = Excoding.decode(data_without_bom, encoding) + + assert encoding == "UTF-16LE" + assert decoded == "Hi" + end + end end From 32cda9d33c2467a3fa25462f1b1bb3b54aa7e3f1 Mon Sep 17 00:00:00 
2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 16:29:40 -0800 Subject: [PATCH 14/15] Add v0.2.1 checksums --- checksum-Elixir.Excoding.Native.exs | 60 ++++++++++++++--------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/checksum-Elixir.Excoding.Native.exs b/checksum-Elixir.Excoding.Native.exs index 35d91c9..feab3d8 100644 --- a/checksum-Elixir.Excoding.Native.exs +++ b/checksum-Elixir.Excoding.Native.exs @@ -1,32 +1,32 @@ %{ - "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:09c536200a0d497a9e20c4dcfe04b1f6228d52cba78504d09590d8895c709d7e", - "excoding-v0.2.0-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:f9f7e0b4048b9d58422fa11d7b1b1b447b30d7ad0d65961b7fdb07c1da065ae8", - "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:33cfd07844bf59875f10d98ab07a0294e9ffba491076757772e1a5d6493fd48c", - "excoding-v0.2.0-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:45537883066c0bf458e6a72bc6c3fd5fe2f6ca03b37428bc4847d44f1b17b033", - "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:6a70afb4a0a0562216d9bd3d2ce8ec61fef7b8410ae9635affc609c5b2f7659f", - "excoding-v0.2.0-nif-2.17-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:53e4bb1368bab589bcd248988e949bb882f14d5735c0735cdf2cba0f20edbb6f", - "libexcoding-v0.2.0-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:d6c8ea8392f729de0df3d530fc6ab7f87b2425ea9eb813acb8f74db3cff34790", - "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:be8b078495bf599ed9b59c40bf803d040d2af524f22dea3ebae7b7fea1f11fc6", - "libexcoding-v0.2.0-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:46abd9dfbd1cb258cd9ce47b6ed314bab977e8576f8ed1b4b1201be474d63314", - "libexcoding-v0.2.0-nif-2.15-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:93fd3d5f5eef4eeded1fd8cb7ad7ba7531e159eb77dbc849b387ef8053aeb7e6", - "libexcoding-v0.2.0-nif-2.15-riscv64gc-unknown-linux-gnu.so.tar.gz" => 
"sha256:fa78586fedb9c33ed662cdd596a9ea24942ade87e5f5fd681588ac2aa921b657", - "libexcoding-v0.2.0-nif-2.15-x86_64-apple-darwin.so.tar.gz" => "sha256:e8230040d453ad3eecc8fe2f626310632d540bdcfcc5e252798a705caff92226", - "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:4449dd9ddf3ed80d1934d2726b415f2a8dc653fcc26a1d9a0ff8ee92a8b61e58", - "libexcoding-v0.2.0-nif-2.15-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:7f793b37dd904d808a7fec5372993575015a59dcf4b182ccbe51d3abb5ab7fdc", - "libexcoding-v0.2.0-nif-2.16-aarch64-apple-darwin.so.tar.gz" => "sha256:e82bb2ab04394ed404d4c8e0ce7acbb10f8b6746ade644d94635ba3751d50f79", - "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:9f069ef03b190ba70b9c30e0ec5059aacd77c46c8699ddc2d715a7ef743f18d1", - "libexcoding-v0.2.0-nif-2.16-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:3d486ad0f6bd446ad12260df6da45a9a2ece103a084d38872f5a131b905a84ef", - "libexcoding-v0.2.0-nif-2.16-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:ded6211f9b10a23b0365e08a27fd24ab9512ec46efb43c552983d3d7198d6c1d", - "libexcoding-v0.2.0-nif-2.16-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:0740978414a2ad4244d602be98759f9a18ad443aeaad5ddd5f4bc9ce385beea2", - "libexcoding-v0.2.0-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:95b8c2a48f85005f89aa6ed78f98c856ce7772a848504e27ab3e7f43d64969a0", - "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:1a230fb9aad95b3f680d868534b0af3f8e273c67481ee49799e484496eedf7b2", - "libexcoding-v0.2.0-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:848e9e77ff85deba9f6634abed00cf730b246c2a3b875189e418c0243f20b733", - "libexcoding-v0.2.0-nif-2.17-aarch64-apple-darwin.so.tar.gz" => "sha256:d974a7356de7a832a1060618d5b6b67401c7e700630cb91e3b4b76e607b48ab5", - "libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:7efbf6a260f6545ecccd59a8f4e05e61f85fb92c8768a9d356b934ff3c5580f5", - 
"libexcoding-v0.2.0-nif-2.17-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:34333f891d86b8868fb6b74c354e1b9d4ed05a2cec4b31f6eea286cc4be4254f", - "libexcoding-v0.2.0-nif-2.17-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:7714b79b3ee304317476dabc2a921dec8c5603ecb72115382091594c814ad69d", - "libexcoding-v0.2.0-nif-2.17-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:0e0f89fab8290ef8851e6f80793e06ef5735f94a4812d5ddf4ac58c23cabcc9a", - "libexcoding-v0.2.0-nif-2.17-x86_64-apple-darwin.so.tar.gz" => "sha256:0e8f099448b8e7cdd983648edb29726679acbf7ce94d994a06cd41c90299b7a1", - "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:10d8eacf8e5dd8cc3d4d967c06e38634d5408e64399014ffd6d100c6b2759182", - "libexcoding-v0.2.0-nif-2.17-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:ad5db21b33f3f065e03f05c520b95fc6ca0e2dae05cadc026b14f2c675480f83", + "excoding-v0.2.1-nif-2.15-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:a88631d5d0cf216fb2781e451ad2e3677093faad0a096a935c40b0f7ae2d9cea", + "excoding-v0.2.1-nif-2.15-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:2380226daa95a91fdb358ccd5d4d3d68c7a921486bcd9e95c11a950fad7435e6", + "excoding-v0.2.1-nif-2.16-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:ff60b5ff8ed95b40a0812e38239f71411bfc48d81e476725d9113c1daae1c11f", + "excoding-v0.2.1-nif-2.16-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:67d437edc507cae0166e07d08559b4a9d704c902bca2e4bda3b73c2333a663c4", + "excoding-v0.2.1-nif-2.17-x86_64-pc-windows-gnu.dll.tar.gz" => "sha256:a5335431ea0a27be028150350096b584944acd2f16e0da67981ca580c0caf66d", + "excoding-v0.2.1-nif-2.17-x86_64-pc-windows-msvc.dll.tar.gz" => "sha256:66a7f5bbb0ab8257fa6aa28f6d2f1298d0714d6b99a266cc4a302a06b10818c8", + "libexcoding-v0.2.1-nif-2.15-aarch64-apple-darwin.so.tar.gz" => "sha256:f59a9f573790d48730463ed8b75f83780d89cc8e488c444ef0e4600ba2847660", + "libexcoding-v0.2.1-nif-2.15-aarch64-unknown-linux-gnu.so.tar.gz" => 
"sha256:f0dfcd78bdb2f773f5dd9e04cf548cfbde3e41d21ab728e593fedf4727f09a8d", + "libexcoding-v0.2.1-nif-2.15-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:f6d040da91c4c6479dc3faec18ec19a1c4d82116cfcc912c4f21bf2a488f39ab", + "libexcoding-v0.2.1-nif-2.15-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:80c933bb067f8bbc9f189df2aa0993214fc5dc7b5e3116349e44768c05147d3d", + "libexcoding-v0.2.1-nif-2.15-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:ae445a0182319ef87ce419ee8499f5d02f1c775f00d7d3358a000c1b85267683", + "libexcoding-v0.2.1-nif-2.15-x86_64-apple-darwin.so.tar.gz" => "sha256:fb4900c1f354383aa713d79c5069d70a3dee64adfd4374f52c469ecbdd16af83", + "libexcoding-v0.2.1-nif-2.15-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:4bde0fd9e27e857600cd051da94fe78218b8851f1f823d93d96531aa2f7ee25d", + "libexcoding-v0.2.1-nif-2.15-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:44f67897e62c7ee976c808b14d7dc2fb571e9a10ffe2dd9d0ad037695a3d5911", + "libexcoding-v0.2.1-nif-2.16-aarch64-apple-darwin.so.tar.gz" => "sha256:6f6719dc150c0fbe60187d577e14a4e41847da51508c48d25ccc6084b9d47c5c", + "libexcoding-v0.2.1-nif-2.16-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:1a8086bd3c427b4d00583dbf988f69555284e3262b803f73c08d785dd3c02c87", + "libexcoding-v0.2.1-nif-2.16-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:035d997775604dc0862d774a83b357b0466575653379ac022946125c4e6d535d", + "libexcoding-v0.2.1-nif-2.16-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:e3485bcd5d7d421444a9cddc381035812a9e82a0eeb2eb041c37686d8b4b1f6b", + "libexcoding-v0.2.1-nif-2.16-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:dca2e512d5d263bd6bc629082860d55a053f18a1d65f5d77a1597706dc582292", + "libexcoding-v0.2.1-nif-2.16-x86_64-apple-darwin.so.tar.gz" => "sha256:0a3d7e14ed95b417cb4bfe075f25474307fd15e49d98400c4c2c42b9b82fa3ff", + "libexcoding-v0.2.1-nif-2.16-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:5680a383ed4eb1520bcbbacf33e1e9f29d7f1de28de67094616c4d9ab731a212", + 
"libexcoding-v0.2.1-nif-2.16-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:63cd273cf13e4233bdb2a4ffb1302a0104b671316fa2aec9796003fb74198a67", + "libexcoding-v0.2.1-nif-2.17-aarch64-apple-darwin.so.tar.gz" => "sha256:674005ace67c665cc42b0c6057b7c61d6a9d7c9d904779daba13ddd69573fca4", + "libexcoding-v0.2.1-nif-2.17-aarch64-unknown-linux-gnu.so.tar.gz" => "sha256:c7177d23a108c2964101ef59fc9910200eec74b270adb1b97933692e56923532", + "libexcoding-v0.2.1-nif-2.17-aarch64-unknown-linux-musl.so.tar.gz" => "sha256:e74544f3779ec3a1b0b2689c3f1ca2bc8915c3d1ef32e8308bb51388ef77a0d5", + "libexcoding-v0.2.1-nif-2.17-arm-unknown-linux-gnueabihf.so.tar.gz" => "sha256:2e2370d0e13e49b139c4c1ca2f2e16228e47e0162fcaf84f2d1b51ae71780835", + "libexcoding-v0.2.1-nif-2.17-riscv64gc-unknown-linux-gnu.so.tar.gz" => "sha256:c9366f7a4c9ca5046e3057a05cc9e0fa1e9f6da167439121a5a3e32256b8ca68", + "libexcoding-v0.2.1-nif-2.17-x86_64-apple-darwin.so.tar.gz" => "sha256:158a10931a0ce593b767d7afce68f0b072290bb514d530523e0af03b8b2728f7", + "libexcoding-v0.2.1-nif-2.17-x86_64-unknown-linux-gnu.so.tar.gz" => "sha256:efae92d75d7860965b4db8cb462cdfdac044352bdc3a37606428ee99e3bdb6f2", + "libexcoding-v0.2.1-nif-2.17-x86_64-unknown-linux-musl.so.tar.gz" => "sha256:e46cc5c2d2f6c728028fd412b6c4e4b69da4ea484517a8ae8340f9cbf47f29c5", } From 9cf8018bd1c0481553732dff2bb2ac1ddf8fe606 Mon Sep 17 00:00:00 2001 From: jeffhuen <32542276+jeffhuen@users.noreply.github.com> Date: Thu, 22 Jan 2026 16:36:34 -0800 Subject: [PATCH 15/15] Add workflow_dispatch trigger for manual builds Allows triggering NIF builds before tagging, so tags can include checksums. 
- Manual builds create draft releases - Verifies version matches mix.exs - Keeps tag-triggered builds as fallback --- .github/workflows/rustler_precompiled.yml | 30 ++++++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rustler_precompiled.yml b/.github/workflows/rustler_precompiled.yml index 2b748e0..dfb992f 100644 --- a/.github/workflows/rustler_precompiled.yml +++ b/.github/workflows/rustler_precompiled.yml @@ -1,6 +1,12 @@ name: Build precompiled NIFs on: + workflow_dispatch: + inputs: + version: + description: 'Version to build (e.g., 0.2.1) - must match mix.exs' + required: true + type: string push: tags: - 'v*' @@ -32,10 +38,25 @@ jobs: - name: Checkout source code uses: actions/checkout@v4 - - name: Extract project version + - name: Set project version shell: bash run: | - echo "PROJECT_VERSION=$(sed -n 's/^ @version "\(.*\)"/\1/p' mix.exs | head -n1)" >> $GITHUB_ENV + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "PROJECT_VERSION=${{ inputs.version }}" >> $GITHUB_ENV + else + # Extract from tag (v0.2.1 -> 0.2.1) + echo "PROJECT_VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV + fi + + - name: Verify version matches mix.exs + shell: bash + run: | + MIX_VERSION=$(sed -n 's/^ @version "\(.*\)"/\1/p' mix.exs | head -n1) + if [ "$MIX_VERSION" != "$PROJECT_VERSION" ]; then + echo "::error::Version mismatch! 
Input: $PROJECT_VERSION, mix.exs: $MIX_VERSION" + exit 1 + fi + echo "Building version $PROJECT_VERSION" - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable @@ -60,9 +81,10 @@ jobs: name: ${{ steps.build-crate.outputs.file-name }} path: ${{ steps.build-crate.outputs.file-path }} - - name: Publish archives and packages + - name: Publish to GitHub Release uses: softprops/action-gh-release@v2 with: + tag_name: v${{ env.PROJECT_VERSION }} + draft: ${{ github.event_name == 'workflow_dispatch' }} files: | ${{ steps.build-crate.outputs.file-path }} - if: startsWith(github.ref, 'refs/tags/')