Browse Source

some cleanup

- remove some unused code
- remove outdated excludes
- news update 2021-09-15
master
René Wagner 1 month ago
parent
commit
6eedbd4190
  1. 1
      .gitignore
  2. 12
      gus/crawl.py
  3. 23
      gus/excludes.py
  4. 4
      infra/rebuild_index.sh
  5. 5
      serve/templates/news.gmi

1
.gitignore

@ -149,3 +149,4 @@ dmypy.json
.vim/
.viminfo
.vimrc
.profile

12
gus/crawl.py

@ -65,10 +65,6 @@ def index_binary(resource, response):
existing_change_frequency, "binary"
)
if ((existing_page is not None and existing_page.first_seen_at is None) or
existing_page is None):
doc["fist_seen_at"] = datetime.utcnow()
page = Page(**doc)
try:
page.save()
@ -109,10 +105,6 @@ def index_redirect(resource, response):
existing_change_frequency, "redirect"
)
if ((existing_page is not None and existing_page.first_seen_at is None) or
existing_page is None):
doc["fist_seen_at"] = datetime.utcnow()
page = Page(**doc)
try:
page.save()
@ -188,10 +180,6 @@ def index_prompt(resource, response):
existing_change_frequency, "prompt"
)
if ((existing_page is not None and existing_page.first_seen_at is None) or
existing_page is None):
doc["fist_seen_at"] = datetime.utcnow()
page = Page(**doc)
try:
page.save()

23
gus/excludes.py

@ -27,9 +27,6 @@ EXCLUDED_URL_PREFIXES = [
# kwiecien gemcast
"gemini://kwiecien.us/gemcast/",
# OmarPolos BSD ports
'gemini://gemini.omarpolo.com/cgi/gempkg',
# breaks crawl due to recursion overflow
"gemini://cadence.moe/chapo/",
@ -40,13 +37,11 @@ EXCLUDED_URL_PREFIXES = [
# Mastodon mirror
"gemini://vps01.rdelaage.ovh/",
"gemini://gemini.lost-frequencies.eu/",
"gemini://mastogem.picasoft.net",
# various failing resources on runjimmyrunrunyoufuckerrun.com
"gemini://runjimmyrunrunyoufuckerrun.com/fonts/",
"gemini://runjimmyrunrunyoufuckerrun.com/tmp/",
"gemini://gemini.conman.org/boston/",
# Search providers
"gemini://houston.coder.town/search?",
@ -68,10 +63,6 @@ EXCLUDED_URL_PREFIXES = [
"gemini://caolan.uk/weather/",
# Alex Schroeder's problematic stuff
"gemini://transjovian.org/",
"gemini://alexschroeder.ch/",
"gemini://alexschroeder.ch:1967",
"gemini://alexschroeder.ch/image_external",
"gemini://alexschroeder.ch/html/",
"gemini://alexschroeder.ch/diff/",
@ -89,11 +80,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://alexschroeder.ch/do/tags",
"gemini://alexschroeder.ch/do/match",
"gemini://alexschroeder.ch/do/search",
"gemini://alexschroeder.ch:1965/do/gallery/",
# communitywiki's problematic stuff
"gemini://communitywiki.org:1966/",
"gemini://communitywiki.org/",
"gemini://alexschroeder.ch/do/gallery/",
# mozz mailing list linkscraper
"gemini://mozz.us/files/gemini-links.gmi",
@ -105,7 +92,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://pon.ix.tc/cgi-bin/youtube.cgi?",
"gemini://pon.ix.tc/youtube/",
# news mirrors - not our businessn
# news mirrors - not our business
"gemini://guardian.shit.cx/",
"gemini://simplynews.metalune.xyz",
"gemini://illegaldrugs.net/cgi-bin/news.php?",
@ -158,11 +145,6 @@ EXCLUDED_URL_PREFIXES = [
"gemini://gemini.thebackupbox.net/radio",
"gemini://higeki.jp/radio",
# list of ~30000 stations, crawling takes too long
"gemini://gemini.tunerapp.org/",
"gemini://tunerapp.org/",
"gemini://thegonz.net:3965/",
# full web proxy
"gemini://drewdevault.com/cgi-bin/web.sh?",
"gemini://gemiprox.pollux.casa/",
@ -170,6 +152,7 @@ EXCLUDED_URL_PREFIXES = [
"gemini://ecs.d2evs.net/proxy/",
# killing crawl, I think maybe because it's too big
# cryptocurrency bullshit
"gemini://gem.denarii.cloud/",
# docs - not our business

4
infra/rebuild_index.sh

@ -1,8 +1,6 @@
cp -r /home/gus/index /home/gus/index.new
#rm -rf /home/gus/index.new/MAIN*
#rm -rf /home/gus/index.new/_MAIN*
/home/gus/.poetry/bin/poetry run build_index -d
rm -rf /home/gus/index.old
rm -rf /home/gus/index.new/MAIN.tmp/
#rm -rf /home/gus/index.new/MAIN.tmp/
mv /home/gus/index /home/gus/index.old
mv /home/gus/index.new /home/gus/index

5
serve/templates/news.gmi

@ -2,6 +2,11 @@
## News
### 2021-09-15
I'm currently quite happy with the reliability and performance of the crawl and indexing processes.
So I removed some older excludes; you should expect to see a whole lot more indexed pages after the next crawl.
We'll have to see if I regret this change... ;)
### 2021-08-18
geminispace.info is now powered by Debian 11 Bullseye :)

Loading…
Cancel
Save