Examples
Runnable scripts live in the repository under `examples/`. Install bytesense (`pip install bytesense`, or `pip install -e .` from a checkout), then run e.g. `python examples/basic_detect.py` from the repo root.
Basic detection (from_bytes, from_path)
#!/usr/bin/env python3
"""Detect encoding from raw bytes and from a file (from_bytes, from_path)."""
from __future__ import annotations
from pathlib import Path
from bytesense import from_bytes, from_path
def main() -> None:
text = "Café naïve — résumé"
data = text.encode("utf-8")
r = from_bytes(data)
print("from_bytes:")
print(f" encoding={r.encoding!r} confidence={r.confidence:.3f} language={r.language!r}")
print(f" bytes={r.byte_count} bom={r.bom_detected}")
root = Path(__file__).resolve().parent.parent
readme = root / "README.md"
if readme.is_file():
r2 = from_path(readme)
print("\nfrom_path (README.md):")
print(f" encoding={r2.encoding!r} confidence={r2.confidence:.3f}")
else:
print("\n(Skip from_path: README.md not found next to repo root.)")
if __name__ == "__main__":
main()
Chardet-style detect() dict
#!/usr/bin/env python3
"""Drop-in style dict like chardet / charset-normalizer (detect)."""
from __future__ import annotations

from bytesense import detect


def main() -> None:
    """Print the chardet-compatible result dict for a few byte payloads."""
    # One ASCII payload, one Cyrillic (cp1251), one Japanese (Shift_JIS).
    samples = [
        b"ASCII only",
        "Здравствуй, мир".encode("cp1251"),
        "こんにちは".encode("shift_jis"),
    ]
    for raw in samples:
        print(detect(raw))


if __name__ == "__main__":
    main()
Streaming chunks (StreamDetector)
#!/usr/bin/env python3
"""Incremental detection with StreamDetector (chunked input)."""
from __future__ import annotations

from bytesense import StreamDetector


def main() -> None:
    """Feed a payload to StreamDetector in tiny chunks, then print the result."""
    # Fixed mojibake: the docs previously showed "naïve café".
    payload = "Streamed UTF-8: naïve café".encode("utf-8")
    det = StreamDetector()
    # Deliberately tiny chunks so multi-byte UTF-8 sequences are split
    # across feed() calls.
    chunk_size = 5
    for i in range(0, len(payload), chunk_size):
        det.feed(payload[i : i + chunk_size])
    det.finalize()
    print(f"encoding={det.encoding!r}")
    print(f"confidence={det.confidence:.4f}")
    print(f"language={det.language!r}")
    # is_stable may be absent on some versions; degrade gracefully.
    print(f"stable={getattr(det, 'is_stable', 'n/a')}")


if __name__ == "__main__":
    main()
Mojibake repair
#!/usr/bin/env python3
"""Mojibake repair after wrong decoding (repair, repair_bytes)."""
from __future__ import annotations

from bytesense import repair, repair_bytes


def main() -> None:
    """Create mojibake deliberately, then repair it via both APIs."""
    # UTF-8 text wrongly interpreted as Latin-1 produces mojibake like
    # "Ã©" for "é".
    garbled = "café".encode("utf-8").decode("latin_1")
    print("garbled string:", repr(garbled))

    # String-level repair.
    out = repair(garbled)
    print("repair(str):", repr(out))

    # Bytes-level repair: re-encode the garbled string back to the raw
    # bytes a file/wire would have carried.
    raw = garbled.encode("latin_1")
    rb = repair_bytes(raw)
    print("repair_bytes improved:", rb.improved)
    print("repaired text:", repr(rb.repaired))


if __name__ == "__main__":
    main()
HTTP + HTML/XML hints
#!/usr/bin/env python3
"""Extract charset hints from HTTP headers and from HTML/XML content."""
from __future__ import annotations

from bytesense import best_hint, hint_from_content, hint_from_http_headers


def main() -> None:
    """Show header-based, content-based, and combined charset hints."""
    # Header hint: charset declared in the Content-Type header.
    headers = {
        "Content-Type": "text/html; charset=windows-1254",
    }
    print("hint_from_http_headers:", hint_from_http_headers(headers))

    # Content hint: <meta charset> inside the document itself.
    html = b"""<!doctype html><html><head>
<meta charset="utf-8">
<title>x</title></head><body></body></html>"""
    print("hint_from_content:", hint_from_content(html))

    # best_hint combines both sources into a single answer.
    print("best_hint:", best_hint(html, headers=headers))


if __name__ == "__main__":
    main()
Multi-encoding documents
#!/usr/bin/env python3
"""Mixed-encoding documents: segment-wise detection (detect_multi)."""
from __future__ import annotations

from bytesense import detect_multi


def main() -> None:
    """Build a two-encoding document and print per-segment detections."""
    # Concatenate a UTF-8 block and a cp1251 block into one document.
    part_en = ("Hello world. " * 80).encode("utf-8")
    part_ru = ("Привет, мир. " * 40).encode("cp1251")
    blob = part_en + part_ru

    outcome = detect_multi(blob, segment_size=2048, min_segment_bytes=64)
    print(f"uniform={outcome.is_uniform} segments={len(outcome.segments)}")
    for seg in outcome.segments:
        preview = seg.text[:48].replace("\n", " ")
        print(f" [{seg.start}:{seg.end}] {seg.encoding!r} {preview!r}...")


if __name__ == "__main__":
    main()