
skip a capsule after 5 consecutive failed requests

This state is reset after the current crawl

closes #16
master
René Wagner, 7 months ago
commit 0b0b33610a
Changed files:
  gus/constants.py (1 change)
  gus/crawl.py (38 changes)
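
In short, the commit adds a module-level failure_count dict in gus/crawl.py, keyed by a resource's normalized host: crawl_page returns early once a host's counter exceeds constants.MAXIMUM_FAILED_REQUEST_COUNT, a failed request increments the counter, and any successful response clears it. A minimal, self-contained sketch of that logic follows; the helper names should_skip and record_response are hypothetical and only summarize what the diff below does inline in crawl_page:

# Hedged sketch of the per-host failure tracking introduced by this commit.
# should_skip/record_response are illustrative helpers, not functions from gus/crawl.py.
MAXIMUM_FAILED_REQUEST_COUNT = 5  # mirrors the new constant in gus/constants.py

failure_count = {}  # normalized host -> consecutive failed requests in the current crawl

def should_skip(host):
    # crawl_page skips the capsule once the host has exceeded the allowed failure streak
    return failure_count.get(host, 0) > MAXIMUM_FAILED_REQUEST_COUNT

def record_response(host, response):
    if response is None:
        # request failed before a response was received: extend the streak
        failure_count[host] = failure_count.get(host, 0) + 1
    else:
        # any response at all resets the streak to zero
        failure_count[host] = 0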

gus/constants.py

@@ -5,6 +5,7 @@ STATISTICS_FILE = "statistics.csv"
DB_FILENAME = "gus.sqlite"
FEED_FILE = "feeds.txt"
MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
MAXIMUM_FAILED_REQUEST_COUNT = 5
MAXIMUM_TEXT_PAGE_SIZE = 100000 # 100KB, in bytes
# default change frequencies (in hours)

gus/crawl.py

@@ -49,7 +49,10 @@ EXCLUDED_URL_PREFIXES = [
    # serving big files and slooow capsule -> takes to long to crawl
    "gemini://kamalatta.ddnss.de/",
    # Mastodon proxy
    "gemini://mastogem.picasoft.net",
    # ASCII art with emulated modem speed
    "gemini://ansi.hrtk.in/",
    "gemini://matrix.kiwifarms.net",
@@ -58,6 +61,9 @@ EXCLUDED_URL_PREFIXES = [
    "gemini://songs.zachdecook.com/song.gmi.php/",
    "gemini://songs.zachdecook.com/chord.svg/",
    # kwiecien gemcast
    "gemini://kwiecien.us/gemcast/",
    # OmarPolos BSD ports
    'gemini://gemini.omarpolo.com/cgi/gempkg',
@@ -82,13 +88,6 @@ EXCLUDED_URL_PREFIXES = [
    "gemini://gus.guru/backlinks?",
    "gemini://gus.guru/threads",
    "gemini://geminispace.info/search/",
    "gemini://geminispace.info/v/search/",
    "gemini://geminispace.info/search?",
    "gemini://geminispace.info/v/search?",
    "gemini://geminispace.info/add-seed?",
    "gemini://geminispace.info/backlinks?",
    "gemini://geminispace.info/threads",
    # Houston
    "gemini://houston.coder.town/search?",
    "gemini://houston.coder.town/search/",
@@ -131,6 +130,7 @@ EXCLUDED_URL_PREFIXES = [
    "gemini://alexschroeder.ch/do/tags",
    "gemini://alexschroeder.ch/do/match",
    "gemini://alexschroeder.ch/do/search",
    "gemini://alexschroeder.ch:1965/do/gallery/",
    # communitywiki's problematic stuff
    "gemini://communitywiki.org:1966/image_external",
@@ -151,6 +151,12 @@ EXCLUDED_URL_PREFIXES = [
    "gemini://communitywiki.org:1966/do/match",
    "gemini://communitywiki.org:1966/do/search",
    # mozz mailing list linkscraper
    "gemini://mozz.us/files/gemini-links.gmi",
    # gemini.techrights.org
    "gemini://gemini.techrights.org/",
    # youtube mirror
    "gemini://pon.ix.tc/cgi-bin/youtube.cgi?",
@@ -452,6 +458,11 @@ def crawl_page(
):
    gr = gemini_resource
    url = gr.fetchable_url
    if gr.normalized_host in failure_count and failure_count[gr.normalized_host] > constants.MAXIMUM_FAILED_REQUEST_COUNT:
        logging.warn(
            "Too much failed requests for host, skipping: %s", gus.lib.logging.strip_control_chars(url)
        )
        return
    if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
        logging.warn(
            "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
@@ -541,8 +552,12 @@ def crawl_page(
            page=page, status=0, is_different=False, timestamp=datetime.utcnow()
        )
        page_crawl.save()
        failure_count[gr.normalized_host] = failure_count[gr.normalized_host] + 1 if gr.normalized_host in failure_count else 1
        logging.debug("Failed request count for host %s is %d", gr.normalized_host, failure_count[gr.normalized_host])
        return
    elif response.status.startswith("4"):
    failure_count[gr.normalized_host] = 0
    if response.status.startswith("4"):
        # temporary error status
        logging.debug(
            "Got temporary error: %s: %s %s",
@@ -828,15 +843,16 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
    global domain_hit_timings
    domain_hit_timings = {}
    global max_crawl_depth
    max_crawl_depth = 100
    max_crawl_depth = 500
    global failure_count
    failure_count = {}
    expired_resources = [GeminiResource(url) for url in load_expired_urls()]
    for resource in expired_resources:
        crawl_page(resource, 0, should_check_if_expired=False)
    submitted_resources = [GeminiResource(url) for url in load_seed_request_urls()]
    for resource in submitted_resources:
        crawl_page(resource, 0, should_check_if_expired=True)
    pickle_robot_file_map(robot_file_map, index_dir)
    logging.info("Finished!")
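
The commit message's note that the state is reset after the current crawl corresponds to run_crawl re-creating the global failure_count dict at the start of every run, so a capsule skipped in one crawl is retried from a clean slate in the next. A small illustrative simulation of that lifecycle (run_one_crawl and the attempt list are made-up stand-ins; only the dict-and-threshold logic mirrors the diff):

MAXIMUM_FAILED_REQUEST_COUNT = 5  # mirrors gus/constants.py

def run_one_crawl(attempts):
    # attempts is a list of (host, succeeded) pairs in crawl order
    failure_count = {}  # rebuilt for every crawl, as run_crawl() does with the global dict
    skipped = []
    for host, succeeded in attempts:
        if failure_count.get(host, 0) > MAXIMUM_FAILED_REQUEST_COUNT:
            skipped.append(host)  # capsule is not fetched again during this crawl
            continue
        if succeeded:
            failure_count[host] = 0  # a successful request clears the streak
        else:
            failure_count[host] = failure_count.get(host, 0) + 1
    return skipped

# A host that keeps failing is eventually skipped; the next crawl starts it at zero again.
print(run_one_crawl([("example.org", False)] * 8))  # ['example.org', 'example.org']
print(run_one_crawl([("example.org", False)] * 2))  # []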
