
support prioritized robots.txt user-agents

Reimplement the can_fetch() logic of RobotFileParser so that it
prioritizes multiple user-agents. Add unit tests for this functionality
and set the user-agents this crawler uses to ["gus", "indexer", "*"]
(as they were in the past, though then with bugs).

This was heavily inspired by the earlier discussion at
https://lists.sr.ht/~natpen/gus/%3C20210212070534.14511-1-rwagner%40rw-net.de%3E
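
For illustration, a minimal sketch of the behavior this change introduces, using the read_from_string() and can_fetch_prioritized() methods added below (the robots.txt content and URL are made up for the example):

    from gus.lib.gemini import GeminiRobotFileParser

    rp = GeminiRobotFileParser("gemini://example.org/robots.txt")
    rp.read_from_string("User-agent: *\nDisallow: /\n\nUser-agent: gus\nAllow: /")

    # the stock can_fetch() consults only the single best-matching entry:
    # there is no "indexer" section, so the default "*" section applies
    assert not rp.can_fetch("indexer", "gemini://example.org/page")

    # the prioritized variant walks the list and stops at the first
    # user-agent with a matching section: "gus" is explicitly allowed
    assert rp.can_fetch_prioritized(["gus", "indexer", "*"], "gemini://example.org/page")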
master
Hannu Hartikainen authored 3 months ago; committed by René Wagner
parent commit f6bd88672e
3 changed files:
  gus/crawl.py                  (7 lines changed)
  gus/lib/gemini.py             (42 lines changed)
  tests/gus/lib/test_gemini.py  (58 lines changed)

gus/crawl.py (7 lines changed)

@@ -364,10 +364,9 @@ def crawl_page(
     crawl_delay = None
     if robots_file is not None:
         logging.debug("Found robots.txt for %s", gr.normalized_url)
-        # only fetch if allowed for user-agents * and indexer
-        # RobotFileParser will return the higher level value (*) if
-        # no indexer section is found
-        can_fetch = robots_file.can_fetch("indexer", gr.normalized_url)
+        # only fetch if allowed for a matching user-agent:
+        # in priority order "gus" > "indexer" > "*"
+        can_fetch = robots_file.can_fetch_prioritized(["gus", "indexer", "*"], gr.normalized_url)
         # same approach as above - last value wins
         # crawl_delay = robots_file.crawl_delay("indexer")
 
gus/lib/gemini.py (42 lines changed)

@@ -1,8 +1,11 @@
 import re
 from urllib.parse import (
+    quote,
+    unquote,
     urljoin,
+    urlparse,
     urlsplit,
     urlunparse,
     urlunsplit,
     uses_relative,
     uses_netloc,
@@ -78,6 +81,45 @@ class GeminiRobotFileParser(RobotFileParser):
         else:
             self.parse(response.content.splitlines())
+
+    def read_from_string(self, robots_txt):
+        """A utility method for writing tests"""
+        self.parse(robots_txt.splitlines())
+
+    def can_fetch_prioritized(self, useragents, url):
+        """Given a url and a prioritized list of user-agents, is fetching allowed?
+
+        Highest priority first; e.g. ["ThisIndexerBot", "generic-indexer", "generic-bot", "*"].
+        """
+        if self.allow_all:
+            return True
+        if self.disallow_all:
+            return False
+        if not self.last_checked:
+            return False
+
+        parsed_url = urlparse(unquote(url))
+        url = urlunparse(("", "", parsed_url.path, parsed_url.params, parsed_url.query, parsed_url.fragment))
+        url = quote(url) or "/"
+
+        def useragent_allowed(useragent):
+            for entry in self.entries:
+                if entry.applies_to(useragent):
+                    return entry.allowance(url)
+            return None
+
+        # map user-agents to allowances; the first non-None will be the prioritized allowance
+        for ua in useragents:
+            allowed = useragent_allowed(ua)
+            if allowed is not None:
+                return allowed
+
+        # if none of the user-agents match, check the default ("*") entry
+        if self.default_entry:
+            return self.default_entry.allowance(url)
+
+        # if nothing matches at all, crawling is allowed
+        return True
 
 
 class GeminiResource:
     def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None):
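
The three urllib.parse calls at the top of can_fetch_prioritized() replicate the URL normalization the stdlib's can_fetch() performs: drop scheme and host, keep path, params, query and fragment, then re-quote. A standalone illustration (output assumes CPython's urllib.parse defaults; the URL is made up):

    from urllib.parse import quote, unquote, urlparse, urlunparse

    parsed = urlparse(unquote("gemini://example.org/a%20b?x=1"))
    path_only = urlunparse(("", "", parsed.path, parsed.params, parsed.query, parsed.fragment))
    print(quote(path_only) or "/")  # prints /a%20b%3Fx%3D1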

tests/gus/lib/test_gemini.py (58 lines changed)

@@ -1,6 +1,6 @@
 import pytest
 
-from gus.lib.gemini import GeminiResource
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 
 
 class TestGeminiResource:
     def test_extract_contained_resources(self):
@@ -72,3 +72,59 @@ text
     def test_is_root_like(self, test_url, expected_result):
         gr = GeminiResource(test_url)
         assert gr.is_root_like == expected_result
+
+
+class TestGeminiRobotFileParser:
+    def _get_parser(self, content):
+        dummy_url = "gemini://dummy/robots.txt"
+        rp = GeminiRobotFileParser(dummy_url)
+        rp.read_from_string(content)
+        return rp
+
+    def _assert_fetchable(self, rp, url="/", fetchable=True):
+        useragents = ["testbot", "genericbot", "*"]
+        assert rp.can_fetch_prioritized(useragents, url) == fetchable
+
+    def test_empty_robots(self):
+        rp = self._get_parser("")
+        self._assert_fetchable(rp)
+
+    def test_disallow_star(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_genericbot(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /
+
+User-agent: genericbot
+Allow: /""")
+        self._assert_fetchable(rp)
+
+    def test_allow_genericbot_but_disallow_testbot(self):
+        rp = self._get_parser("""User-agent: genericbot
+Allow: /
+
+User-agent: testbot
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_star_but_disallow_genericbot(self):
+        rp = self._get_parser("""User-agent: *
+Allow: /
+
+User-agent: genericbot
+Disallow: /""")
+        self._assert_fetchable(rp, "/", False)
+
+    def test_allow_only_testbot(self):
+        rp = self._get_parser("""User-agent: *
+Disallow: /
+
+User-agent: genericbot
+Disallow: /
+
+User-agent: testbot
+Allow: /""")
+        self._assert_fetchable(rp)
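
Each test feeds an inline robots.txt to the parser through read_from_string() and asserts the verdict for the fixed priority list ["testbot", "genericbot", "*"]. Note that the empty-robots case passes because can_fetch_prioritized() falls through to its final return True when no entries exist. Assuming the repository's usual pytest setup, the suite runs with e.g. pytest tests/gus/lib/test_gemini.py.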
