robots.txt sections "*" and "indexer" are honored

We no longer use the "gus" section for ease of implementation.
It's probably barely used anyway.
master
René Wagner committed 8 months ago
parent commit 8520ec533c

Changed files:
  1. docs/handling-robots.md (1 line changed)
  2. gus/crawl.py (16 lines changed)

docs/handling-robots.md (1 line changed)

@@ -4,7 +4,6 @@ robots.txt is fetched for each (sub)domain before actually crawling the content.
 GUS honors the following User-agents:
 * indexer
-* gus
 * *
 ## robots.txt caching
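
As an illustration of the documented behaviour (not part of this commit; the paths are invented), a capsule's robots.txt can address the crawler either through a dedicated "indexer" section or through the catch-all section:

User-agent: indexer
Disallow: /logs/
Crawl-delay: 10

User-agent: *
Disallow: /cgi-bin/

A capsule that only provides a "*" section is still honored, because the parser falls back to it; see the sketch after the gus/crawl.py hunk below.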

gus/crawl.py (16 lines changed)

@@ -491,19 +491,11 @@ def crawl_page(
     crawl_delay = None
     if robots_file is not None:
         logging.debug("Found robots.txt for %s", gr.normalized_url)
-        # only fetch if both user-agents are allowed to fetch
-        # RobotFileParser will return the higher level value (*) if no specific
-        # value is found, but has no understanding the "gus" is a more specific
-        # form of an indexer
-        logging.debug("can_fetch indexer: %s",robots_file.can_fetch("indexer", gr.normalized_url))
-        logging.debug("can_fetch gus: %s",robots_file.can_fetch("gus", gr.normalized_url))
-        can_fetch = (robots_file.can_fetch("indexer", gr.normalized_url) and
-                     robots_file.can_fetch("gus", gr.normalized_url))
-        # same approach as above - last value wins
-        crawl_delay = robots_file.crawl_delay("*")
+        # only fetch if allowed for user-agents * and indexer
+        # RobotFileParser will return the higher level value (*) if
+        # no indexer section is found
+        can_fetch = robots_file.can_fetch("indexer", gr.normalized_url)
         crawl_delay = robots_file.crawl_delay("indexer")
-        crawl_delay = robots_file.crawl_delay("gus")
         if not can_fetch:
             logging.info(
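
The simplified code relies on urllib.robotparser falling back to the "*" rules whenever the requested user-agent has no section of its own. A minimal sketch of that behaviour (illustrative only; the parse_robots helper and the URLs are made up and are not part of the commit):

from urllib.robotparser import RobotFileParser

def parse_robots(text):
    parser = RobotFileParser()
    parser.modified()  # record a fetch time so can_fetch()/crawl_delay() treat the file as read
    parser.parse(text.splitlines())
    return parser

# A robots.txt with a dedicated "indexer" section: its rules apply directly.
specific = parse_robots("""\
User-agent: indexer
Disallow: /private/
Crawl-delay: 10

User-agent: *
Disallow: /
""")
print(specific.can_fetch("indexer", "gemini://example.org/private/x"))  # False
print(specific.can_fetch("indexer", "gemini://example.org/public/x"))   # True
print(specific.crawl_delay("indexer"))                                  # 10

# A robots.txt with only a "*" section: queries for "indexer" fall back to it.
generic = parse_robots("""\
User-agent: *
Disallow: /private/
""")
print(generic.can_fetch("indexer", "gemini://example.org/private/x"))   # False
print(generic.crawl_delay("indexer"))                                   # None, no Crawl-delay given

A "gus" section, by contrast, would only be consulted if the crawler asked for that user-agent explicitly, which the new code no longer does.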
