Examples
Runnable scripts live in the repository under `examples/`. Install bytesense (`pip install bytesense`, or `pip install -e .` from a checkout), then run e.g. `python examples/basic_detect.py` from the repo root.
Basic detection (from_bytes, from_path)
#!/usr/bin/env python3
"""Detect encoding from raw bytes and from a file (from_bytes, from_path)."""
from __future__ import annotations
from pathlib import Path
from bytesense import from_bytes, from_path
def main() -> None:
text = "Café naïve — résumé"
data = text.encode("utf-8")
r = from_bytes(data)
print("from_bytes:")
print(f" encoding={r.encoding!r} confidence={r.confidence:.3f} language={r.language!r}")
print(f" bytes={r.byte_count} bom={r.bom_detected}")
root = Path(__file__).resolve().parent.parent
readme = root / "README.md"
if readme.is_file():
r2 = from_path(readme)
print("\nfrom_path (README.md):")
print(f" encoding={r2.encoding!r} confidence={r2.confidence:.3f}")
else:
print("\n(Skip from_path: README.md not found next to repo root.)")
if __name__ == "__main__":
main()
Chardet-style detect() dict
#!/usr/bin/env python3
"""Drop-in style dict like chardet / charset-normalizer (detect)."""
from __future__ import annotations

from bytesense import detect


def main() -> None:
    """Print the chardet-compatible result dict for a few byte payloads."""
    # One ASCII payload, one Cyrillic (cp1251), one Japanese (Shift_JIS).
    samples = [
        b"ASCII only",
        "Здравствуй, мир".encode("cp1251"),
        "こんにちは".encode("shift_jis"),
    ]
    for raw in samples:
        print(detect(raw))


if __name__ == "__main__":
    main()
Streaming chunks (StreamDetector)
#!/usr/bin/env python3
"""Incremental detection with StreamDetector (chunked input)."""
from __future__ import annotations

from bytesense import StreamDetector


def main() -> None:
    """Feed a payload to StreamDetector in tiny chunks, then print the result."""
    # Fixed mojibake: the docs previously showed "naïve café".
    payload = "Streamed UTF-8: naïve café".encode("utf-8")
    det = StreamDetector()
    # Deliberately tiny chunks so multi-byte UTF-8 sequences are split
    # across feed() calls.
    chunk_size = 5
    for i in range(0, len(payload), chunk_size):
        det.feed(payload[i : i + chunk_size])
    det.finalize()
    print(f"encoding={det.encoding!r}")
    print(f"confidence={det.confidence:.4f}")
    print(f"language={det.language!r}")
    # is_stable may be absent on some versions; degrade gracefully.
    print(f"stable={getattr(det, 'is_stable', 'n/a')}")


if __name__ == "__main__":
    main()
Mojibake repair
#!/usr/bin/env python3
"""Mojibake repair after wrong decoding (repair, repair_bytes)."""
from __future__ import annotations

from bytesense import repair, repair_bytes


def main() -> None:
    """Create mojibake deliberately, then repair it via both APIs."""
    # UTF-8 text wrongly interpreted as Latin-1 produces mojibake like
    # "Ã©" for "é".
    garbled = "café".encode("utf-8").decode("latin_1")
    print("garbled string:", repr(garbled))

    # String-level repair.
    out = repair(garbled)
    print("repair(str):", repr(out))

    # Bytes-level repair: re-encode the garbled string back to the raw
    # bytes a file/wire would have carried.
    raw = garbled.encode("latin_1")
    rb = repair_bytes(raw)
    print("repair_bytes improved:", rb.improved)
    print("repaired text:", repr(rb.repaired))


if __name__ == "__main__":
    main()
HTTP + HTML/XML hints
#!/usr/bin/env python3
"""Extract charset hints from HTTP headers and from HTML/XML content."""
from __future__ import annotations

from bytesense import best_hint, hint_from_content, hint_from_http_headers


def main() -> None:
    """Show header-based, content-based, and combined charset hints."""
    # Header hint: charset declared in the Content-Type header.
    headers = {
        "Content-Type": "text/html; charset=windows-1254",
    }
    print("hint_from_http_headers:", hint_from_http_headers(headers))

    # Content hint: <meta charset> inside the document itself.
    html = b"""<!doctype html><html><head>
<meta charset="utf-8">
<title>x</title></head><body></body></html>"""
    print("hint_from_content:", hint_from_content(html))

    # best_hint combines both sources into a single answer.
    print("best_hint:", best_hint(html, headers=headers))


if __name__ == "__main__":
    main()
Multi-encoding documents
#!/usr/bin/env python3
"""Mixed-encoding documents: segment-wise detection (detect_multi)."""
from __future__ import annotations

from bytesense import detect_multi


def main() -> None:
    """Build a two-encoding document and print per-segment detections."""
    # Concatenate a UTF-8 block and a cp1251 block into one document.
    part_en = ("Hello world. " * 80).encode("utf-8")
    part_ru = ("Привет, мир. " * 40).encode("cp1251")
    blob = part_en + part_ru

    outcome = detect_multi(blob, segment_size=2048, min_segment_bytes=64)
    print(f"uniform={outcome.is_uniform} segments={len(outcome.segments)}")
    for seg in outcome.segments:
        preview = seg.text[:48].replace("\n", " ")
        print(f" [{seg.start}:{seg.end}] {seg.encoding!r} {preview!r}...")


if __name__ == "__main__":
    main()