
add some forbidden URIs & set max_crawl_depth

master
René Wagner, 8 months ago
commit af967cc728
1 changed file, 60 changed lines

gus/crawl.py

@@ -40,15 +40,35 @@ EXCLUDED_URL_PREFIXES = [
"gemini://localhost",
"gemini://example.org",
"gemini://example.com",
"gemini://www.youtube.com/",
"gemini://gemini.conman.org/test",
"gemini://gemini.circumlunar.space/users/fgaz/calculator/",
"gemini://gemini.bortzmeyer.org/rfc-mirror/",
# all combinations of a tictactoe board
"gemini://tictactoe.lanterne.chilliet.eu",
# serving big files and a slooow capsule -> takes too long to crawl
"gemini://kamalatta.ddnss.de/",
# ASCII art with emulated modem speed
"gemini://ansi.hrtk.in/",
"gemini://matrix.kiwifarms.net",
# ZachDeCook's songs
"gemini://songs.zachdecook.com/song.gmi.php/",
"gemini://songs.zachdecook.com/chord.svg/",
# OmarPolo's BSD ports
"gemini://gemini.omarpolo.com/cgi/gempkg",
# breaks crawl due to recursion overflow
"gemini://cadence.moe/chapo/",
"gemini://nixo.xyz/reply/",
"gemini://nixo.xyz/notify",
"gemini://blah.com/",
"gemini://gemini.thebackupbox.net/queryresponse",
"gemini://gem.garichankar.com/share_audio",
# various failing resources on runjimmyrunrunyoufuckerrun.com
"gemini://runjimmyrunrunyoufuckerrun.com/fonts/",
"gemini://runjimmyrunrunyoufuckerrun.com/tmp/",
@@ -62,12 +82,12 @@ EXCLUDED_URL_PREFIXES = [
"gemini://gus.guru/backlinks?",
"gemini://gus.guru/threads",
"gemini://geminispace.info/search",
"gemini://geminispace.info/v/search",
"gemini://geminispace.info/search",
"gemini://geminispace.info/v/search",
"gemini://geminispace.info/add-seed",
"gemini://geminispace.info/backlinks",
"gemini://geminispace.info/search/",
"gemini://geminispace.info/v/search/",
"gemini://geminispace.info/search?",
"gemini://geminispace.info/v/search?",
"gemini://geminispace.info/add-seed?",
"gemini://geminispace.info/backlinks?",
"gemini://geminispace.info/threads",
# Houston
"gemini://houston.coder.town/search?",
@@ -134,6 +154,9 @@ EXCLUDED_URL_PREFIXES = [
# youtube mirror
"gemini://pon.ix.tc/cgi-bin/youtube.cgi?",
# guardian mirror
"gemini://guardian.shit.cx/",
# wikipedia proxy
"gemini://wp.pitr.ca/",
"gemini://wp.glv.one/",
@@ -141,7 +164,7 @@ EXCLUDED_URL_PREFIXES = [
# client torture test
"gemini://egsam.pitr.ca/",
"gemini://egsam.glv.one/",
# mozz's chat
"gemini://chat.mozz.us/stream",
"gemini://chat.mozz.us/submit",
@@ -163,6 +186,9 @@ EXCLUDED_URL_PREFIXES = [
"gemini://gemini.thebackupbox.net/radio",
"gemini://higeki.jp/radio",
# list of ~30000 stations, crawling takes too long
"gemini://gemini.tunerapp.org/stations/",
# this page inexplicably breaks both build_index and elpher
# when I browse to it... I think it might have some weird encoding
# issues in its content or something, but that's a problem for a
@@ -176,21 +202,11 @@ EXCLUDED_URL_PREFIXES = [
# killing crawl, I think maybe because it's too big
"gemini://gem.denarii.cloud/pichaindata.zip",
"gemini://matrix.kiwifarms.net",
# these threads seem to expire
"gemini://dioskouroi.xyz/thread",
# french news mirrors, there's just too much
"gemini://jpfox.fr/rss/",
# ZachDeCook's songs
"gemini://songs.zachdecook.com/song.gmi.php/",
"gemini://songs.zachdecook.com/chord.svg/",
# robots.txt not served correctly
"gemini://orrg.clttr.info/orrg.pl",
"gemini://gmndemo.clttr.info/orrg/orrg.pl",
]
EXCLUDED_URL_PATHS = [
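The entries above are plain string prefixes, so the crawler only needs a prefix check to skip a forbidden URI. A minimal sketch of such a check, assuming the helper name and the lowercasing step (neither appears in this commit):

def is_excluded(url: str) -> bool:
    """Return True if the URL starts with any excluded prefix (sketch only)."""
    normalized = url.strip().lower()
    return any(normalized.startswith(prefix) for prefix in EXCLUDED_URL_PREFIXES)

# e.g. is_excluded("gemini://kamalatta.ddnss.de/bigfile.bin") -> True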
@@ -500,7 +516,7 @@ def crawl_page(
)
elif not crawl_delay:
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
milliseconds=500
milliseconds=300
)
else:
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
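This hunk lowers the default pause between requests to the same host from 500 ms to 300 ms when no robots.txt crawl-delay is given. A hedged sketch of that per-host throttle, with the helper name and the explicit sleep being assumptions rather than code from this commit:

import time
from datetime import datetime, timedelta

DEFAULT_DELAY_MS = 300  # was 500 before this commit

def throttle(host, domain_hit_timings, crawl_delay_ms=None):
    """Sleep until the per-host delay has elapsed, then record the hit (sketch)."""
    last_hit = domain_hit_timings.get(host)
    if last_hit is not None:
        delay_ms = crawl_delay_ms if crawl_delay_ms else DEFAULT_DELAY_MS
        next_allowed_hit = last_hit + timedelta(milliseconds=delay_ms)
        wait = (next_allowed_hit - datetime.now()).total_seconds()
        if wait > 0:
            time.sleep(wait)
    domain_hit_timings[host] = datetime.now()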
@@ -806,7 +822,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
global db
db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
global robot_file_map
robot_file_map = (
{} if should_run_destructive else unpickle_robot_file_map(constants.INDEX_DIR)
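For context, robot_file_map is rebuilt from scratch on a destructive run and otherwise restored from disk. A speculative sketch of what unpickle_robot_file_map might look like; the pickle filename is an assumption, not taken from this diff:

import pathlib
import pickle

def unpickle_robot_file_map(index_dir):
    """Load the cached robots.txt map, or return an empty dict if absent (sketch)."""
    path = pathlib.Path(index_dir) / "robot_file_map.p"  # filename assumed
    if not path.is_file():
        return {}
    with path.open("rb") as f:
        return pickle.load(f)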
@@ -814,7 +830,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
global domain_hit_timings
domain_hit_timings = {}
global max_crawl_depth
max_crawl_depth = -1
max_crawl_depth = 100
expired_resources = [GeminiResource(url) for url in load_expired_urls()]
for resource in expired_resources:
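With max_crawl_depth now set to 100 instead of -1 (unlimited), link-following is bounded, which guards against runaway recursion like the cadence.moe case excluded above. A small self-contained sketch of how such a depth cap can work; the function and parameter names are assumptions, not the real crawl_page signature:

max_crawl_depth = 100  # value set by this commit; -1 would mean "no limit"

def crawl(url, get_links, depth=0, seen=None):
    """Depth-bounded crawl over whatever get_links(url) returns (sketch)."""
    seen = set() if seen is None else seen
    if url in seen:
        return
    if max_crawl_depth >= 0 and depth > max_crawl_depth:
        return  # cap reached: do not follow links any deeper
    seen.add(url)
    for link in get_links(url):
        crawl(link, get_links, depth + 1, seen)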
