
don't persist robots.txt over multiple crawls

Instead, fetch them again on every crawl run and only
cache them for the crawl session
master
René Wagner, 3 months ago
commit 9efd819e3e

Changed files:
  1. docs/handling-robots.md (4 lines changed)
  2. gus/crawl.py (23 lines changed)

docs/handling-robots.md

@@ -8,6 +8,4 @@ GUS honors the following User-agents:
 ## robots.txt caching
-Every fetched robots.txt is cached in `index/robot_file_map.p`, even if they were empty/missing.
-To force a refetch of _all_ robots.txt for _all_ capsulses, simply delete the file named above and run a crawl.
+Every fetched robots.txt is cached only for the current crawl.

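To illustrate the new behaviour, here is a minimal, hypothetical sketch of a session-scoped robots.txt cache: the map starts empty at the beginning of every crawl run, each capsule's robots.txt is fetched at most once per run, and nothing is ever written to `index/robot_file_map.p`. The `fetch_robots_txt` helper and the "indexer" user-agent below are placeholders for illustration, not GUS's actual code.

```python
# Illustrative sketch only; fetch_robots_txt() stands in for the crawler's real fetch logic.
import urllib.robotparser


def fetch_robots_txt(domain):
    # Placeholder: a real crawler would request gemini://<domain>/robots.txt here
    # and return its text, or "" if the file is missing.
    return ""


def get_robots_parser(domain, cache):
    """Return a parser for this domain, fetching robots.txt at most once per crawl run."""
    if domain not in cache:
        parser = urllib.robotparser.RobotFileParser()
        parser.parse(fetch_robots_txt(domain).splitlines())
        cache[domain] = parser
    return cache[domain]


def run_crawl_session(domains):
    robot_file_map = {}  # lives only for this crawl run; never pickled to disk
    for domain in domains:
        parser = get_robots_parser(domain, robot_file_map)
        if parser.can_fetch("indexer", f"gemini://{domain}/"):  # example user-agent
            pass  # crawl the capsule here
    # robot_file_map goes out of scope here; the next run starts with an empty cache
```
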
gus/crawl.py

@@ -5,7 +5,6 @@ import re
 from datetime import datetime, timedelta
 import os
 import pathlib
-import pickle
 import time
 from urllib.parse import urljoin, uses_relative, uses_netloc
@@ -551,18 +550,6 @@ def crawl_page(
         response.status,
     )
 
-def pickle_robot_file_map(robot_file_map, index_dir):
-    pickle.dump(robot_file_map, open(index_dir + "/robot_file_map.p", "wb"))
-
-def unpickle_robot_file_map(index_dir):
-    if not os.path.isfile(index_dir + "/robot_file_map.p"):
-        logging.debug("Robot file cache missing")
-        return {}
-    return pickle.load(open(index_dir + "/robot_file_map.p", "rb"))
-
 def load_expired_urls():
     expired_pages = Page.raw(
         """SELECT url
@@ -665,7 +652,7 @@ def recrawl_feeds():
     global max_crawl_depth
     max_crawl_depth = 0
     global robot_file_map
-    robot_file_map = unpickle_robot_file_map(constants.INDEX_DIR)
+    robot_file_map = {}
     global domain_hit_timings
     domain_hit_timings = {}
@@ -673,7 +660,6 @@ def recrawl_feeds():
     for resource in seed_resources:
         crawl_page(resource, 0)
 
-    pickle_robot_file_map(robot_file_map, index_dir)
     logging.debug(
         "Recrawled feeds: %s", gus.lib.logging.strip_control_chars(content_urls)
     )
@@ -681,8 +667,6 @@
 def run_crawl(should_run_destructive=False, seed_urls=[]):
     # TODO: track failed domain/page attempts, and don't reattempt for 15mins
     global index_dir
     index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
     pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
@@ -690,9 +674,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
     global robot_file_map
-    robot_file_map = (
-        {} if should_run_destructive else unpickle_robot_file_map(constants.INDEX_DIR)
-    )
+    robot_file_map = {}
     global domain_hit_timings
     domain_hit_timings = {}
     global max_crawl_depth
@@ -706,7 +688,6 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     submitted_resources = [GeminiResource(url) for url in load_seed_request_urls()]
     for resource in submitted_resources:
         crawl_page(resource, 0, should_check_if_expired=True)
-    pickle_robot_file_map(robot_file_map, index_dir)
     logging.info("Finished!")
