Skip to content

Commit

Permalink
Migrate tests from unittest to pure pytest. (#238)
Browse files (browse the repository at this point in the history)
  • Loading branch information
wRAR authored Feb 16, 2025
1 parent 6caf279 commit cc15435
Show file tree
Hide file tree
Showing 7 changed files with 777 additions and 871 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,3 +14,4 @@ coverage.xml
/index.txt
.dmypy.json
.hypothesis/
.idea/
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,7 +15,6 @@ filename = "docs/conf.py"

[tool.coverage.run]
branch = true
include = ["w3lib/*"]

[tool.coverage.report]
exclude_also = [
Expand Down Expand Up @@ -54,6 +53,7 @@ disable = [
"raise-missing-from",
"redefined-builtin",
"redefined-outer-name",
"too-few-public-methods",
"too-many-arguments",
"too-many-branches",
"too-many-lines",
Expand Down Expand Up @@ -91,6 +91,8 @@ extend-select = [
"PIE",
# pylint
"PL",
# flake8-pytest-style
"PT",
# flake8-use-pathlib
"PTH",
# flake8-pyi
Expand Down
86 changes: 42 additions & 44 deletions tests/test_encoding.py
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import codecs
import unittest
from typing import Any

from w3lib.encoding import (
Expand All @@ -14,7 +13,7 @@
)


class RequestEncodingTests(unittest.TestCase):
class TestRequestEncoding:
utf8_fragments = [
# Content-Type as meta http-equiv
b"""<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
Expand Down Expand Up @@ -44,84 +43,84 @@ def test_bom(self):
assert bom_encoding is not None
assert bom is not None
decoded = string[len(bom) :].decode(bom_encoding)
self.assertEqual(water_unicode, decoded)
assert water_unicode == decoded
# Body without BOM
enc, bom = read_bom(b"foo")
self.assertEqual(enc, None)
self.assertEqual(bom, None)
assert enc is None
assert bom is None
# Empty body
enc, bom = read_bom(b"")
self.assertEqual(enc, None)
self.assertEqual(bom, None)
assert enc is None
assert bom is None

def test_http_encoding_header(self):
header_value = "Content-Type: text/html; charset=ISO-8859-4"
extracted = http_content_type_encoding(header_value)
self.assertEqual(extracted, "iso8859-4")
self.assertEqual(None, http_content_type_encoding("something else"))
assert extracted == "iso8859-4"
assert http_content_type_encoding("something else") is None

def test_html_body_declared_encoding(self):
for fragment in self.utf8_fragments:
encoding = html_body_declared_encoding(fragment)
self.assertEqual(encoding, "utf-8", fragment)
self.assertEqual(None, html_body_declared_encoding(b"something else"))
self.assertEqual(
None,
assert encoding == "utf-8", fragment
assert None is html_body_declared_encoding(b"something else")
assert (
html_body_declared_encoding(
b"""
<head></head><body>
this isn't searched
<meta charset="utf-8">
"""
),
)
is None
)
self.assertEqual(
None,
assert (
html_body_declared_encoding(
b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
),
)
is None
)

def test_html_body_declared_encoding_unicode(self):
# html_body_declared_encoding should work when unicode body is passed
self.assertEqual(None, html_body_declared_encoding("something else"))
assert html_body_declared_encoding("something else") is None

for fragment in self.utf8_fragments:
encoding = html_body_declared_encoding(fragment.decode("utf8"))
self.assertEqual(encoding, "utf-8", fragment)
assert encoding == "utf-8", fragment

self.assertEqual(
None,
assert (
html_body_declared_encoding(
"""
<head></head><body>
this isn't searched
<meta charset="utf-8">
"""
),
)
is None
)
self.assertEqual(
None,
assert (
html_body_declared_encoding(
"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
),
)
is None
)


class CodecsEncodingTestCase(unittest.TestCase):
class TestCodecsEncoding:
def test_resolve_encoding(self):
self.assertEqual(resolve_encoding("latin1"), "cp1252")
self.assertEqual(resolve_encoding(" Latin-1"), "cp1252")
self.assertEqual(resolve_encoding("gb_2312-80"), "gb18030")
self.assertEqual(resolve_encoding("unknown encoding"), None)
assert resolve_encoding("latin1") == "cp1252"
assert resolve_encoding(" Latin-1") == "cp1252"
assert resolve_encoding("gb_2312-80") == "gb18030"
assert resolve_encoding("unknown encoding") is None


class UnicodeDecodingTestCase(unittest.TestCase):
class TestUnicodeDecoding:
def test_utf8(self):
self.assertEqual(to_unicode(b"\xc2\xa3", "utf-8"), "\xa3")
assert to_unicode(b"\xc2\xa3", "utf-8") == "\xa3"

def test_invalid_utf8(self):
self.assertEqual(to_unicode(b"\xc2\xc2\xa3", "utf-8"), "\ufffd\xa3")
assert to_unicode(b"\xc2\xc2\xa3", "utf-8") == "\ufffd\xa3"


def ct(charset: str | None) -> str | None:
Expand All @@ -132,14 +131,14 @@ def norm_encoding(enc: str) -> str:
return codecs.lookup(enc).name


class HtmlConversionTests(unittest.TestCase):
class TestHtmlConversion:
def test_unicode_body(self):
unicode_string = "\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442"
original_string = unicode_string.encode("cp1251")
encoding, body_unicode = html_to_unicode(ct("cp1251"), original_string)
# check body_as_unicode
self.assertTrue(isinstance(body_unicode, str))
self.assertEqual(body_unicode, unicode_string)
assert isinstance(body_unicode, str)
assert body_unicode == unicode_string

def _assert_encoding(
self,
Expand All @@ -150,15 +149,14 @@ def _assert_encoding(
) -> None:
assert not isinstance(body, str)
encoding, body_unicode = html_to_unicode(ct(content_type), body)
self.assertTrue(isinstance(body_unicode, str))
self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
assert isinstance(body_unicode, str)
assert norm_encoding(encoding) == norm_encoding(expected_encoding)

if isinstance(expected_unicode, str):
self.assertEqual(body_unicode, expected_unicode)
assert body_unicode == expected_unicode
else:
self.assertTrue(
body_unicode in expected_unicode,
f"{body_unicode} is not in {expected_unicode}",
assert body_unicode in expected_unicode, (
f"{body_unicode} is not in {expected_unicode}"
)

def test_content_type_and_conversion(self):
Expand Down Expand Up @@ -227,8 +225,8 @@ def _assert_encoding_detected(
) -> None:
assert not isinstance(body, str)
encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
self.assertTrue(isinstance(body_unicode, str))
self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
assert isinstance(body_unicode, str)
assert norm_encoding(encoding) == norm_encoding(expected_encoding)

def test_BOM(self):
# utf-16 cases already tested, as is the BOM detection function
Expand Down
Loading

0 comments on commit cc15435

Please sign in to comment.