Migrate to ruff. #56

Open: wants to merge 1 commit into base: master

7 changes: 0 additions & 7 deletions .bumpversion.cfg

This file was deleted.

4 changes: 0 additions & 4 deletions .coveragerc

This file was deleted.

15 changes: 0 additions & 15 deletions .flake8

This file was deleted.

25 changes: 6 additions & 19 deletions .pre-commit-config.yaml
@@ -1,20 +1,7 @@
repos:
- hooks:
- id: black
repo: https://github.com/psf/black
rev: 24.10.0
- hooks:
- id: isort
language_version: python3
repo: https://github.com/timothycrosley/isort
rev: 5.13.2
- hooks:
- id: flake8
language_version: python3
additional_dependencies:
- flake8-bugbear
- flake8-comprehensions
- flake8-debugger
- flake8-string-format
repo: https://github.com/pycqa/flake8
rev: 7.1.1
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.7
hooks:
- id: ruff
args: [ --fix ]
- id: ruff-format
145 changes: 142 additions & 3 deletions pyproject.toml
@@ -1,3 +1,142 @@
[tool.isort]
profile = "black"
multi_line_output = 3
[tool.bumpversion]
current_version = "0.4.0"
commit = true
tag = true
tag_name = "{new_version}"

[[tool.bumpversion.files]]
filename = "setup.py"

[tool.coverage.run]
branch = true

[tool.coverage.report]
exclude_also = [
"if TYPE_CHECKING:",
]

[tool.ruff.lint]
extend-select = [
# flake8-bugbear
"B",
# flake8-comprehensions
"C4",
# pydocstyle
"D",
# flake8-future-annotations
"FA",
# flynt
"FLY",
# refurb
"FURB",
# isort
"I",
# flake8-implicit-str-concat
"ISC",
# flake8-logging
"LOG",
# Perflint
"PERF",
# pygrep-hooks
"PGH",
# flake8-pie
"PIE",
# pylint
"PL",
# flake8-use-pathlib
"PTH",
# flake8-pyi
"PYI",
# flake8-quotes
"Q",
# flake8-return
"RET",
# flake8-raise
"RSE",
# Ruff-specific rules
"RUF",
# flake8-bandit
"S",
# flake8-simplify
"SIM",
# flake8-slots
"SLOT",
# flake8-debugger
"T10",
# flake8-type-checking
"TC",
# pyupgrade
"UP",
# pycodestyle warnings
"W",
# flake8-2020
"YTT",
]
ignore = [
# Missing docstring in public module
"D100",
# Missing docstring in public class
"D101",
# Missing docstring in public method
"D102",
# Missing docstring in public function
"D103",
# Missing docstring in public package
"D104",
# Missing docstring in magic method
"D105",
# Missing docstring in public nested class
"D106",
# Missing docstring in __init__
"D107",
# One-line docstring should fit on one line with quotes
"D200",
# No blank lines allowed after function docstring
"D202",
# 1 blank line required between summary line and description
"D205",
# Multi-line docstring closing quotes should be on a separate line
"D209",
# First line should end with a period
"D400",
# First line should be in imperative mood; try rephrasing
"D401",
# First line should not be the function's "signature"
"D402",
# First word of the first line should be properly capitalized
"D403",
# `try`-`except` within a loop incurs performance overhead
"PERF203",
# Too many return statements
"PLR0911",
# Too many branches
"PLR0912",
# Too many arguments in function definition
"PLR0913",
# Too many statements
"PLR0915",
# Magic value used in comparison
"PLR2004",
# `for` loop variable `line` overwritten by assignment target
"PLW2901",
# String contains ambiguous {}.
"RUF001",
# Docstring contains ambiguous {}.
"RUF002",
# Comment contains ambiguous {}.
"RUF003",
# Mutable class attributes should be annotated with `typing.ClassVar`
"RUF012",
# Use of `assert` detected
"S101",

# to be done when adding type hints
# Use `typing.NamedTuple` instead of `collections.namedtuple`
"PYI024",
]

[tool.ruff.lint.pydocstyle]
convention = "pep257"

[tool.ruff.lint.per-file-ignores]
"tests/*" = ["S"]
8 changes: 0 additions & 8 deletions setup.cfg

This file was deleted.

5 changes: 3 additions & 2 deletions setup.py
@@ -1,11 +1,12 @@
#!/usr/bin/env python
from pathlib import Path

from setuptools import find_packages, setup

setup(
name="Protego",
version="0.4.0",
description="Pure-Python robots.txt parser with support for modern conventions",
long_description=open("README.rst").read(),
long_description=Path("README.rst").read_text(encoding="utf-8"),
long_description_content_type="text/x-rst",
url="https://github.com/scrapy/protego",
author="Anubhav Patel",
64 changes: 26 additions & 38 deletions src/protego.py
@@ -32,7 +32,7 @@

_HEX_DIGITS = set("0123456789ABCDEFabcdef")

__all__ = ["RequestRate", "Protego"]
__all__ = ["Protego", "RequestRate"]


def _is_valid_directive_field(field):
@@ -49,7 +49,7 @@ def _is_valid_directive_field(field):
)


class _URLPattern(object):
class _URLPattern:
"""Internal class which represents a URL pattern."""

def __init__(self, pattern):
@@ -96,11 +96,10 @@ def _prepare_pattern_for_regex(self, pattern):
s[index] = re.escape(substr)
elif s[index] == "*":
s[index] = ".*?"
pattern = "".join(s)
return pattern
return "".join(s)


class _RuleSet(object):
class _RuleSet:
"""Internal class which stores rules for a user agent."""

def __init__(self, parser_instance):
@@ -131,23 +130,21 @@ def hex_to_byte(h):

# ignore contains %xy escapes for characters that are not
# meant to be converted back.
ignore = {"{ord_c:02X}".format(ord_c=ord(c)) for c in ignore}
ignore = {f"{ord(c):02X}" for c in ignore}

parts = url.split("%")
parts[0] = parts[0].encode("utf-8")

for i in range(1, len(parts)):
if len(parts[i]) >= 2:
# %xy is a valid escape only if x and y are hexadecimal digits.
if set(parts[i][:2]).issubset(_HEX_DIGITS):
# make sure that all %xy escapes are in uppercase.
hexcode = parts[i][:2].upper()
leftover = parts[i][2:]
if hexcode not in ignore:
parts[i] = hex_to_byte(hexcode) + leftover.encode("utf-8")
continue
else:
parts[i] = hexcode + leftover
# %xy is a valid escape only if x and y are hexadecimal digits.
if len(parts[i]) >= 2 and set(parts[i][:2]).issubset(_HEX_DIGITS):
# make sure that all %xy escapes are in uppercase.
hexcode = parts[i][:2].upper()
leftover = parts[i][2:]
if hexcode not in ignore:
parts[i] = hex_to_byte(hexcode) + leftover.encode("utf-8")
continue
parts[i] = hexcode + leftover

# add back the '%' we removed during splitting.
parts[i] = b"%" + parts[i].encode("utf-8")
@@ -158,8 +155,8 @@ def hexescape(self, char):
"""Escape char as RFC 2396 specifies"""
hex_repr = hex(ord(char))[2:].upper()
if len(hex_repr) == 1:
hex_repr = "0%s" % hex_repr
return "%" + hex_repr
hex_repr = f"0{hex_repr}"
return f"%{hex_repr}"

def _quote_path(self, path):
"""Return percent encoded path."""
@@ -172,7 +169,7 @@ def _quote_path(self, path):
return path or "/"

def _quote_pattern(self, pattern):
if pattern.startswith("https://") or pattern.startswith("http://"):
if pattern.startswith(("https://", "http://")):
pattern = "/" + pattern
if pattern.startswith("//"):
pattern = "//" + pattern
@@ -191,8 +188,7 @@ def _quote_pattern(self, pattern):
parts = ParseResult(
"", "", pattern + last_char, parts.params, parts.query, parts.fragment
)
pattern = urlunparse(parts)
return pattern
return urlunparse(parts)

def allow(self, pattern):
if "$" in pattern:
@@ -244,10 +240,8 @@ def crawl_delay(self, delay):
except ValueError:
# Value is malformed, do nothing.
logger.debug(
"Malformed rule at line {line_seen} : cannot set crawl delay to '{delay}'. "
"Ignoring this rule.".format(
line_seen=self._parser_instance._total_line_seen, delay=delay
)
f"Malformed rule at line {self._parser_instance._total_line_seen} : "
f"cannot set crawl delay to '{delay}'. Ignoring this rule."
)
return

@@ -285,10 +279,8 @@ def request_rate(self, value):
except Exception:
# Value is malformed, do nothing.
logger.debug(
"Malformed rule at line {line_seen} : cannot set request rate using '{value}'. "
"Ignoring this rule.".format(
line_seen=self._parser_instance._total_line_seen, value=value
)
f"Malformed rule at line {self._parser_instance._total_line_seen} : "
f"cannot set request rate using '{value}'. Ignoring this rule."
)
return

@@ -312,16 +304,14 @@ def visit_time(self, value):
start_time, end_time = self._parse_time_period(value, separator=" ")
except Exception:
logger.debug(
"Malformed rule at line {line_seen} : cannot set visit time using '{value}'. "
"Ignoring this rule.".format(
line_seen=self._parser_instance._total_line_seen, value=value
)
f"Malformed rule at line {self._parser_instance._total_line_seen} : "
f"cannot set visit time using '{value}'. Ignoring this rule."
)
return
self._visit_time = VisitTime(start_time, end_time)


class Protego(object):
class Protego:
def __init__(self):
# A dict mapping user agents (specified in robots.txt) to rule sets.
self._user_agents = {}
@@ -403,9 +393,7 @@ def _parse_robotstxt(self, content):
and field not in _SITEMAP_DIRECTIVE
):
logger.debug(
"Rule at line {line_seen} without any user agent to enforce it on.".format(
line_seen=self._total_line_seen
)
f"Rule at line {self._total_line_seen} without any user agent to enforce it on."
)
continue

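
The src/protego.py changes above are mechanical modernizations of the kind ruff autofixes: f-strings instead of str.format(), a single startswith() call with a tuple, returning expressions directly, and classes that no longer inherit from object explicitly. A small standalone sketch of the same patterns, using illustrative names that are not from Protego:

# Illustrative only: the before/after style applied throughout the diff above.
class Widget:  # UP004: plain `class Widget:` instead of `class Widget(object):`
    def describe(self, count):
        return f"{count} widgets"  # UP032: f-string instead of "{}".format(count)


def is_http_url(url):
    # PIE810: one startswith() call with a tuple instead of two chained checks
    return url.startswith(("https://", "http://"))


def normalize(text):
    # RET504: return the expression directly instead of assigning it to a variable first
    return "".join(text.split())


print(Widget().describe(3), is_http_url("https://example.com"), normalize("a b c"))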