From 8450791c5099149a245dbb9afa16e13035a72590 Mon Sep 17 00:00:00 2001
From: Andrey Rakhmatullin
Date: Sat, 22 Feb 2025 21:02:23 +0500
Subject: [PATCH] Migrate to ruff.

---
 .bumpversion.cfg                   |   7 --
 .coveragerc                        |   4 -
 .flake8                            |  15 ---
 .pre-commit-config.yaml            |  25 ++---
 pyproject.toml                     | 145 ++++++++++++++++++++++++++++-
 setup.cfg                          |   8 --
 setup.py                           |   5 +-
 src/protego.py                     |  64 ++++++-------
 tests/fetch_robotstxt.py           |  21 +++--
 tests/test_on_fetched_robotstxt.py |  28 +++---
 tests/test_on_google_spec.py       |  12 +--
 tests/test_protego.py              | 128 +++++++++----------
 tests/test_unquote.py              |  37 ++++----
 tox.ini                            |   2 -
 14 files changed, 266 insertions(+), 235 deletions(-)
 delete mode 100644 .bumpversion.cfg
 delete mode 100644 .coveragerc
 delete mode 100644 .flake8
 delete mode 100644 setup.cfg

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
deleted file mode 100644
index 72dffe4..0000000
--- a/.bumpversion.cfg
+++ /dev/null
@@ -1,7 +0,0 @@
-[bumpversion]
-current_version = 0.4.0
-commit = True
-tag = True
-tag_name = {new_version}
-
-[bumpversion:file:setup.py]
diff --git a/.coveragerc b/.coveragerc
deleted file mode 100644
index 584a9e7..0000000
--- a/.coveragerc
+++ /dev/null
@@ -1,4 +0,0 @@
-[run]
-branch = true
-include = src/*
-omit = tests/*
diff --git a/.flake8 b/.flake8
deleted file mode 100644
index dbfa735..0000000
--- a/.flake8
+++ /dev/null
@@ -1,15 +0,0 @@
-[flake8]
-ignore =
-    # Refers to the max-line length. Let's suppress the error and simply
-    # let black take care on how it wants to format the lines.
-    E501,
-
-    # Refers to "line break before binary operator".
-    # Similar to above, let black take care of the formatting.
-    W503,
-
-    # Refers to "Unnecessary dict call - rewrite as a literal".
-    C408
-per-file-ignores =
-    # Ignore: "Found for loop that reassigns the iterable it is iterating with each iterable value"
-    src/protego.py:430:21:B020
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5398be8..d3e17be 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,20 +1,7 @@
 repos:
-  - hooks:
-      - id: black
-    repo: https://github.com/psf/black
-    rev: 24.10.0
-  - hooks:
-      - id: isort
-        language_version: python3
-    repo: https://github.com/timothycrosley/isort
-    rev: 5.13.2
-  - hooks:
-      - id: flake8
-        language_version: python3
-        additional_dependencies:
-          - flake8-bugbear
-          - flake8-comprehensions
-          - flake8-debugger
-          - flake8-string-format
-    repo: https://github.com/pycqa/flake8
-    rev: 7.1.1
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.9.7
+  hooks:
+  - id: ruff
+    args: [ --fix ]
+  - id: ruff-format
diff --git a/pyproject.toml b/pyproject.toml
index aa7e766..6f39f9d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,142 @@
-[tool.isort]
-profile = "black"
-multi_line_output = 3
+[tool.bumpversion]
+current_version = "0.4.0"
+commit = true
+tag = true
+tag_name = "{new_version}"
+
+[[tool.bumpversion.files]]
+filename = "setup.py"
+
+[tool.coverage.run]
+branch = true
+
+[tool.coverage.report]
+exclude_also = [
+    "if TYPE_CHECKING:",
+]
+
+[tool.ruff.lint]
+extend-select = [
+    # flake8-bugbear
+    "B",
+    # flake8-comprehensions
+    "C4",
+    # pydocstyle
+    "D",
+    # flake8-future-annotations
+    "FA",
+    # flynt
+    "FLY",
+    # refurb
+    "FURB",
+    # isort
+    "I",
+    # flake8-implicit-str-concat
+    "ISC",
+    # flake8-logging
+    "LOG",
+    # Perflint
+    "PERF",
+    # pygrep-hooks
+    "PGH",
+    # flake8-pie
+    "PIE",
+    # pylint
+    "PL",
+    # flake8-use-pathlib
+    "PTH",
+    # flake8-pyi
+    "PYI",
+    # flake8-quotes
+    "Q",
+    # flake8-return
+    "RET",
+    # flake8-raise
+    "RSE",
+    # Ruff-specific rules
+    "RUF",
+    # flake8-bandit
+    "S",
+    # flake8-simplify
+    "SIM",
+    # flake8-slots
+    "SLOT",
+    # flake8-debugger
+    "T10",
+    # flake8-type-checking
+    "TC",
+    # pyupgrade
+    "UP",
+    # pycodestyle warnings
+    "W",
+    # flake8-2020
+    "YTT",
+]
+ignore = [
+    # Missing docstring in public module
+    "D100",
+    # Missing docstring in public class
+    "D101",
+    # Missing docstring in public method
+    "D102",
+    # Missing docstring in public function
+    "D103",
+    # Missing docstring in public package
+    "D104",
+    # Missing docstring in magic method
+    "D105",
+    # Missing docstring in public nested class
+    "D106",
+    # Missing docstring in __init__
+    "D107",
+    # One-line docstring should fit on one line with quotes
+    "D200",
+    # No blank lines allowed after function docstring
+    "D202",
+    # 1 blank line required between summary line and description
+    "D205",
+    # Multi-line docstring closing quotes should be on a separate line
+    "D209",
+    # First line should end with a period
+    "D400",
+    # First line should be in imperative mood; try rephrasing
+    "D401",
+    # First line should not be the function's "signature"
+    "D402",
+    # First word of the first line should be properly capitalized
+    "D403",
+    # `try`-`except` within a loop incurs performance overhead
+    "PERF203",
+    # Too many return statements
+    "PLR0911",
+    # Too many branches
+    "PLR0912",
+    # Too many arguments in function definition
+    "PLR0913",
+    # Too many statements
+    "PLR0915",
+    # Magic value used in comparison
+    "PLR2004",
+    # `for` loop variable `line` overwritten by assignment target
+    "PLW2901",
+    # String contains ambiguous {}.
+    "RUF001",
+    # Docstring contains ambiguous {}.
+    "RUF002",
+    # Comment contains ambiguous {}.
+    "RUF003",
+    # Mutable class attributes should be annotated with `typing.ClassVar`
+    "RUF012",
+    # Use of `assert` detected
+    "S101",
+
+    # to be done when adding type hints
+    # Use `typing.NamedTuple` instead of `collections.namedtuple`
+    "PYI024",
+]
+
+[tool.ruff.lint.pydocstyle]
+convention = "pep257"
+
+[tool.ruff.lint.per-file-ignores]
+"tests/*" = ["S"]
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 0f6b27e..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,8 +0,0 @@
-[bdist_wheel]
-universal = 1
-
-[metadata]
-description_file = README.rst
-
-[aliases]
-test=pytest tests
diff --git a/setup.py b/setup.py
index b5604bc..d410276 100755
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,12 @@
-#!/usr/bin/env python
+from pathlib import Path
+
 from setuptools import find_packages, setup
 
 setup(
     name="Protego",
     version="0.4.0",
     description="Pure-Python robots.txt parser with support for modern conventions",
-    long_description=open("README.rst").read(),
+    long_description=Path("README.rst").read_text(encoding="utf-8"),
     long_description_content_type="text/x-rst",
     url="https://github.com/scrapy/protego",
     author="Anubhav Patel",
diff --git a/src/protego.py b/src/protego.py
index 0d330f0..b2969fb 100644
--- a/src/protego.py
+++ b/src/protego.py
@@ -32,7 +32,7 @@
 _HEX_DIGITS = set("0123456789ABCDEFabcdef")
 
 
-__all__ = ["RequestRate", "Protego"]
+__all__ = ["Protego", "RequestRate"]
 
 
 def _is_valid_directive_field(field):
@@ -49,7 +49,7 @@ def _is_valid_directive_field(field):
     )
 
 
-class _URLPattern(object):
+class _URLPattern:
     """Internal class which represents a URL pattern."""
 
     def __init__(self, pattern):
@@ -96,11 +96,10 @@ def _prepare_pattern_for_regex(self, pattern):
                 s[index] = re.escape(substr)
             elif s[index] == "*":
                 s[index] = ".*?"
-        pattern = "".join(s)
-        return pattern
+        return "".join(s)
 
 
-class _RuleSet(object):
+class _RuleSet:
     """Internal class which stores rules for a user agent."""
 
     def __init__(self, parser_instance):
@@ -131,23 +130,21 @@ def hex_to_byte(h):
 
         # ignore contains %xy escapes for characters that are not
         # meant to be converted back.
-        ignore = {"{ord_c:02X}".format(ord_c=ord(c)) for c in ignore}
+        ignore = {f"{ord(c):02X}" for c in ignore}
         parts = url.split("%")
         parts[0] = parts[0].encode("utf-8")
 
         for i in range(1, len(parts)):
-            if len(parts[i]) >= 2:
-                # %xy is a valid escape only if x and y are hexadecimal digits.
-                if set(parts[i][:2]).issubset(_HEX_DIGITS):
-                    # make sure that all %xy escapes are in uppercase.
-                    hexcode = parts[i][:2].upper()
-                    leftover = parts[i][2:]
-                    if hexcode not in ignore:
-                        parts[i] = hex_to_byte(hexcode) + leftover.encode("utf-8")
-                        continue
-                    else:
-                        parts[i] = hexcode + leftover
+            # %xy is a valid escape only if x and y are hexadecimal digits.
+            if len(parts[i]) >= 2 and set(parts[i][:2]).issubset(_HEX_DIGITS):
+                # make sure that all %xy escapes are in uppercase.
+                hexcode = parts[i][:2].upper()
+                leftover = parts[i][2:]
+                if hexcode not in ignore:
+                    parts[i] = hex_to_byte(hexcode) + leftover.encode("utf-8")
+                    continue
+                parts[i] = hexcode + leftover
 
             # add back the '%' we removed during splitting.
             parts[i] = b"%" + parts[i].encode("utf-8")
 
@@ -158,8 +155,8 @@ def hexescape(self, char):
         """Escape char as RFC 2396 specifies"""
         hex_repr = hex(ord(char))[2:].upper()
         if len(hex_repr) == 1:
-            hex_repr = "0%s" % hex_repr
-        return "%" + hex_repr
+            hex_repr = f"0{hex_repr}"
+        return f"%{hex_repr}"
 
     def _quote_path(self, path):
         """Return percent encoded path."""
@@ -172,7 +169,7 @@ def _quote_path(self, path):
         return path or "/"
 
     def _quote_pattern(self, pattern):
-        if pattern.startswith("https://") or pattern.startswith("http://"):
+        if pattern.startswith(("https://", "http://")):
             pattern = "/" + pattern
         if pattern.startswith("//"):
             pattern = "//" + pattern
@@ -191,8 +188,7 @@ def _quote_pattern(self, pattern):
         parts = ParseResult(
             "", "", pattern + last_char, parts.params, parts.query, parts.fragment
         )
-        pattern = urlunparse(parts)
-        return pattern
+        return urlunparse(parts)
 
     def allow(self, pattern):
         if "$" in pattern:
@@ -244,10 +240,8 @@ def crawl_delay(self, delay):
         except ValueError:
             # Value is malformed, do nothing.
             logger.debug(
-                "Malformed rule at line {line_seen} : cannot set crawl delay to '{delay}'. "
-                "Ignoring this rule.".format(
-                    line_seen=self._parser_instance._total_line_seen, delay=delay
-                )
+                f"Malformed rule at line {self._parser_instance._total_line_seen} : "
+                f"cannot set crawl delay to '{delay}'. Ignoring this rule."
             )
             return
 
@@ -285,10 +279,8 @@ def request_rate(self, value):
         except Exception:
             # Value is malformed, do nothing.
             logger.debug(
-                "Malformed rule at line {line_seen} : cannot set request rate using '{value}'. "
-                "Ignoring this rule.".format(
-                    line_seen=self._parser_instance._total_line_seen, value=value
-                )
+                f"Malformed rule at line {self._parser_instance._total_line_seen} : "
+                f"cannot set request rate using '{value}'. Ignoring this rule."
             )
             return
 
@@ -312,16 +304,14 @@ def visit_time(self, value):
             start_time, end_time = self._parse_time_period(value, separator=" ")
         except Exception:
             logger.debug(
-                "Malformed rule at line {line_seen} : cannot set visit time using '{value}'. "
-                "Ignoring this rule.".format(
-                    line_seen=self._parser_instance._total_line_seen, value=value
-                )
+                f"Malformed rule at line {self._parser_instance._total_line_seen} : "
+                f"cannot set visit time using '{value}'. Ignoring this rule."
             )
             return
         self._visit_time = VisitTime(start_time, end_time)
 
 
-class Protego(object):
+class Protego:
     def __init__(self):
         # A dict mapping user agents (specified in robots.txt) to rule sets.
         self._user_agents = {}
@@ -403,9 +393,7 @@ def _parse_robotstxt(self, content):
                 and field not in _SITEMAP_DIRECTIVE
             ):
                 logger.debug(
-                    "Rule at line {line_seen} without any user agent to enforce it on.".format(
-                        line_seen=self._total_line_seen
-                    )
+                    f"Rule at line {self._total_line_seen} without any user agent to enforce it on."
                 )
                 continue
 
diff --git a/tests/fetch_robotstxt.py b/tests/fetch_robotstxt.py
index b5dddf9..adda1d8 100644
--- a/tests/fetch_robotstxt.py
+++ b/tests/fetch_robotstxt.py
@@ -2,12 +2,12 @@
 Usage
 -----
 
->>> python fetch_robotstxt.py -l top-10000-websites.txt -d test_data
+$ python fetch_robotstxt.py -l top-10000-websites.txt -d test_data
 """
 
 import argparse
-import os
 import sys
+from pathlib import Path
 from urllib.parse import ParseResult, urlparse, urlunparse
 
 import scrapy
@@ -20,6 +20,7 @@
     action="append",
     dest="websites",
    help="Adds to the list of websites.",
+    type=Path,
 )
 parser.add_argument(
     "-d",
@@ -27,6 +28,7 @@
     action="store",
     dest="directory",
     help="Directory to save robots.txt files.",
+    type=Path,
 )
 
 args = parser.parse_args()
@@ -39,23 +41,22 @@ class RobotstxtSpider(scrapy.Spider):
     name = "robotstxt_spider"
 
     def start_requests(self):
+        w: Path
         for w in args.websites:
-            if os.path.isfile(w):
-                with open(w, "r") as f:
+            if w.is_file():
+                with w.open() as f:
                     for domain in f:
-                        domain = domain.strip()
                         yield scrapy.Request(
-                            url="https://{}/robots.txt".format(domain),
+                            url=f"https://{domain.strip()}/robots.txt",
                             callback=self.parse,
                             errback=self.err_cb,
                         )
 
     def parse(self, response):
         filename = urlparse(response.url).netloc
-        if not os.path.exists(args.directory):
-            os.mkdir(args.directory)
-        with open(os.path.join(args.directory, filename), "wb") as f:
-            f.write(response.body)
+        if not args.directory.exists():
+            args.directory.mkdir()
+        (args.directory / filename).write_bytes(response.body)
 
     def err_cb(self, failure):
         request = failure.request
diff --git a/tests/test_on_fetched_robotstxt.py b/tests/test_on_fetched_robotstxt.py
index c03caac..ec3ea68 100644
--- a/tests/test_on_fetched_robotstxt.py
+++ b/tests/test_on_fetched_robotstxt.py
@@ -1,29 +1,23 @@
-from os import listdir
-from os.path import abspath, dirname, isfile, join
+from pathlib import Path
 
 import pytest
 
 from protego import Protego
 
-test_data_directory = join(dirname(abspath(__file__)), "test_data")
-robotstxts = [
-    f for f in listdir(test_data_directory) if isfile(join(test_data_directory, f))
-]
+test_data_directory = Path(__file__).parent / "test_data"
+robotstxts = [f for f in test_data_directory.iterdir() if f.is_file()]
 
 
 @pytest.mark.parametrize("path_to_robotstxt", robotstxts)
 def test_no_exceptions(path_to_robotstxt):
     try:
-        with open(join(test_data_directory, path_to_robotstxt), "rb") as f:
-            try:
-                content = f.read().decode("utf-8")
-            except UnicodeDecodeError:
-                # Downloaded robots.txt is malformed, ignore this
-                return
-            Protego.parse(content=content)
+        try:
+            content = path_to_robotstxt.read_bytes().decode("utf-8")
+        except UnicodeDecodeError:
+            # Downloaded robots.txt is malformed, ignore this
+            return
+        Protego.parse(content=content)
     except Exception as e:
         raise AssertionError(
-            "{error}. Exception raised while parsing {robots}".format(
-                error=e, robots=join(path_to_robotstxt, "robots.txt")
-            )
-        )
+            f"{e}. Exception raised while parsing http://{path_to_robotstxt.name}/robots.txt"
+        ) from e
diff --git a/tests/test_on_google_spec.py b/tests/test_on_google_spec.py
index 56e69e8..3e6a095 100644
--- a/tests/test_on_google_spec.py
+++ b/tests/test_on_google_spec.py
@@ -90,12 +90,10 @@ def test_user_agent_precedence(path, user_agent):
     ],
 )
 def test_path_matching(pattern, path, match):
-    content = """
+    content = f"""
     User-Agent: *
     disallow: {pattern}
-    """.format(
-        pattern=pattern
-    )
+    """
     rp = Protego.parse(content)
     assert (not rp.can_fetch(path, "*")) == match
 
@@ -110,11 +108,9 @@ def test_record_precedence(rules, url, allowed):
     ],
 )
 def test_record_precedence(rules, url, allowed):
-    content = """
+    content = f"""
    User-Agent: *
     {rules}
-    """.format(
-        rules=rules
-    )
+    """
     rp = Protego.parse(content)
     assert rp.can_fetch(url, "*") == allowed
diff --git a/tests/test_protego.py b/tests/test_protego.py
index 67337a8..c4cbf82 100644
--- a/tests/test_protego.py
+++ b/tests/test_protego.py
@@ -9,16 +9,13 @@ class TestProtego(TestCase):
     def test_allowed(self):
         content = (
-            "User-agent: * \n"
-            "Disallow: /disallowed \n"
-            "Allow: /allowed \n"
-            "Crawl-delay: 10"
+            "User-agent: * \nDisallow: /disallowed \nAllow: /allowed \nCrawl-delay: 10"
         )
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("https://www.site.local/allowed", "*"))
         self.assertFalse(rp.can_fetch("https://www.site.local/disallowed", "*"))
 
-        content = "User-agent: * \n" "Disallow: /d \n" "Crawl-delay: 10"
+        content = "User-agent: * \nDisallow: /d \nCrawl-delay: 10"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("https://www.site.local/abc/d", "*"))
         self.assertFalse(rp.can_fetch("https://www.site.local/disallowed", "*"))
 
@@ -42,105 +39,101 @@ def test_malformed_disallow(self):
         self.assertFalse(rp.can_fetch("https://www.site.local/six", "*"))
 
     def test_length_based_precedence(self):
-        content = "User-agent: * \n" "Disallow: / \n" "Allow: /page"
+        content = "User-agent: * \nDisallow: / \nAllow: /page"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("https://www.site.local/page", "*"))
         self.assertFalse(rp.can_fetch("https://www.site.local/elsewhere", "*"))
 
-        content = "user-agent: FooBot\n" "disallow: /x/page.html\n" "allow: /x/\n"
+        content = "user-agent: FooBot\ndisallow: /x/page.html\nallow: /x/\n"
         rp = Protego.parse(content=content)
         self.assertFalse(rp.can_fetch("http://foo.bar/x/page.html", "FooBot"))
 
-        content = "user-agent: FooBot\n" "allow: /x/page.html\n" "disallow: /x/\n"
+        content = "user-agent: FooBot\nallow: /x/page.html\ndisallow: /x/\n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot"))
         self.assertFalse(rp.can_fetch("http://foo.bar/x/", "FooBot"))
 
         # In case of equivalent disallow and allow patterns for the same
         # user-agent, allow is used.
-        content = "user-agent: FooBot\n" "disallow: \n" "allow: \n"
+        content = "user-agent: FooBot\ndisallow: \nallow: \n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot"))
 
-        content = "user-agent: FooBot\n" "disallow: /\n" "allow: /\n"
+        content = "user-agent: FooBot\ndisallow: /\nallow: /\n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot"))
 
-        content = "user-agent: FooBot\n" "disallow: /x\n" "allow: /x/\n"
+        content = "user-agent: FooBot\ndisallow: /x\nallow: /x/\n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://foo.bar/x/", "FooBot"))
         self.assertFalse(rp.can_fetch("http://foo.bar/x", "FooBot"))
 
-        content = (
-            "user-agent: FooBot\n" "disallow: /x/page.html\n" "allow: /x/page.html\n"
-        )
+        content = "user-agent: FooBot\ndisallow: /x/page.html\nallow: /x/page.html\n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot"))
 
-        content = "user-agent: FooBot\n" "allow: /page\n" "disallow: /*.html\n"
+        content = "user-agent: FooBot\nallow: /page\ndisallow: /*.html\n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://foo.bar/page", "FooBot"))
         self.assertFalse(rp.can_fetch("http://foo.bar/page.html", "FooBot"))
 
-        content = "user-agent: FooBot\n" "allow: /x/page.\n" "disallow: /*.html\n"
+        content = "user-agent: FooBot\nallow: /x/page.\ndisallow: /*.html\n"
         rp = Protego.parse(content=content)
         # Longest match wins.
         self.assertTrue(rp.can_fetch("http://foo.bar/x/page.html", "FooBot"))
         self.assertFalse(rp.can_fetch("http://foo.bar/x/y.html", "FooBot"))
 
-        content = (
-            "User-agent: *\n" "Disallow: /x/\n" "User-agent: FooBot\n" "Disallow: /y/\n"
-        )
+        content = "User-agent: *\nDisallow: /x/\nUser-agent: FooBot\nDisallow: /y/\n"
         rp = Protego.parse(content=content)
         # Most specific group for FooBot allows implicitly /x/page.
         self.assertTrue(rp.can_fetch("http://foo.bar/x/page", "FooBot"))
         self.assertFalse(rp.can_fetch("http://foo.bar/y/page", "FooBot"))
 
-        content = "user-agent: FooBot\n" "allow: /p\n" "disallow: /\n"
+        content = "user-agent: FooBot\nallow: /p\ndisallow: /\n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://example.com/page", "FooBot"))
 
-        content = "user-agent: FooBot\n" "allow: /folder\n" "disallow: /folder\n"
+        content = "user-agent: FooBot\nallow: /folder\ndisallow: /folder\n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://example.com/folder/page", "FooBot"))
 
-        content = "user-agent: FooBot\n" "disallow: /folder\n" "allow: /folder\n"
+        content = "user-agent: FooBot\ndisallow: /folder\nallow: /folder\n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://example.com/folder/page", "FooBot"))
 
-        content = "user-agent: FooBot\n" "allow: /page\n" "disallow: /*.htm\n"
+        content = "user-agent: FooBot\nallow: /page\ndisallow: /*.htm\n"
         rp = Protego.parse(content=content)
         self.assertFalse(rp.can_fetch("http://example.com/page.htm", "FooBot"))
 
-        content = "user-agent: FooBot\n" "allow: /$\n" "disallow: /\n"
+        content = "user-agent: FooBot\nallow: /$\ndisallow: /\n"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("http://example.com/", "FooBot"))
         self.assertFalse(rp.can_fetch("http://example.com/page.html", "FooBot"))
 
     def test_escaped_url(self):
-        content = "User-agent: * \n" "Disallow: / \n" "Allow: /a%3cd.html"
+        content = "User-agent: * \nDisallow: / \nAllow: /a%3cd.html"
         rp = Protego.parse(content=content)
         self.assertTrue(rp.can_fetch("https://www.site.local/a