Browse Source

[crawl] Add is_cross_host_like field to db

remotes/src/master
Natalie Pendragon 1 year ago
parent
commit
c341bb82ae
  1. 1
      gus/crawl.py
  2. 5
      gus/lib/db_model.py
  3. 19
      gus/lib/gemini.py
  4. 24
      scripts/add_is_cross_host_like.py

1
gus/crawl.py

@ -277,6 +277,7 @@ def index_links(from_resource, contained_resources):
data.append({
"from_page": from_page,
"to_page": to_page,
"is_cross_host_like": Link.get_is_cross_host_like(from_resource, cr),
})
Link.insert_many(data).execute()

5
gus/lib/db_model.py

@ -47,6 +47,11 @@ class Link(Model):
from_page = ForeignKeyField(Page, backref="outbound_links", on_delete='CASCADE')
to_page = ForeignKeyField(Page, backref="backlinks", on_delete='CASCADE')
is_cross_host_like = BooleanField()
def get_is_cross_host_like(from_resource, to_resource):
    """Return True when the two resources belong to different "host-likes".

    A host-like is the normalized host, possibly extended with a user-root
    path segment (e.g. "example.com/~alice"), so a link between two users on
    the same multi-user host still counts as cross-host-like.
    """
    same_host_like = (from_resource.normalized_host_like
                      == to_resource.normalized_host_like)
    return not same_host_like
class Crawl(Model):
"""

19
gus/lib/gemini.py

@ -12,7 +12,8 @@ uses_relative.append("gemini")
uses_netloc.append("gemini")
# Paths that end in a conventional gemlog directory name ("log-like").
LOG_LIKE_PATTERN = re.compile(".*/(gemlog|glog|starlog|pikkulog)/?$")

# Matches ONLY a bare user-root path -- "/~user", "/users/user", or "/users",
# with an optional trailing slash and nothing after it ($-anchored).
ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$")

# Matches any path that STARTS with a user-root prefix (no end anchor), e.g.
# "/~user/posts/1" -- used to derive the "host-like" for a resource.
# NOTE: the flattened diff showed two conflicting ROOT_LIKE_PATTERN
# assignments (old $-anchored and new unanchored); only the new, unanchored
# one is kept here, with the $-anchored variant renamed ROOT_LIKE_ONLY_PATTERN.
ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?")
class GeminiRobotFileParser(RobotFileParser):
def set_url(self, url):
@ -47,6 +48,7 @@ class GeminiResource():
self.fully_qualified_parent_url = fully_qualified_parent_url
self._normalized_url = None
self._normalized_host = None
self._normalized_host_like = None
self._fetchable_url = None
self._indexable_url = None
self._is_root_like = None
@ -107,6 +109,18 @@ class GeminiResource():
return self._normalized_host
def _get_normalized_host_like(self):
    """Return the cached "host-like" for this resource, computing it once.

    The host-like is the normalized host, extended with a leading user-root
    path segment (e.g. "/~alice", trailing slash stripped) when the path
    begins with one. Returns None for an invalid resource.
    """
    if not self.is_valid:
        return None
    if self._normalized_host_like is None:
        host_like = self.normalized_host
        root_match = ROOT_LIKE_PATTERN.match(self.urlsplit.path)
        if root_match:
            # m[0] is the whole matched prefix, including the leading "/".
            host_like += root_match[0].rstrip("/")
        self._normalized_host_like = host_like
    return self._normalized_host_like
def _get_fetchable_url(self):
if not self.is_valid:
return None
@ -146,7 +160,7 @@ class GeminiResource():
def _get_is_root_like(self):
    """Lazily determine whether this resource is "root-like".

    A resource is root-like when its path is empty, "/", or exactly a
    user-root path such as "/~user" or "/users/user" (ROOT_LIKE_ONLY_PATTERN,
    which is $-anchored so deeper paths do not qualify). The result is
    cached on the instance after the first computation.
    """
    if self._is_root_like is None:
        # The flattened diff left both the old (ROOT_LIKE_PATTERN) and new
        # (ROOT_LIKE_ONLY_PATTERN) versions of this condition stacked as two
        # consecutive dangling `if` lines; only the new, end-anchored check
        # belongs here.
        path = self.urlsplit.path
        is_root_like = False
        if path == "" or path == "/" or ROOT_LIKE_ONLY_PATTERN.match(path):
            is_root_like = True
        self._is_root_like = is_root_like
    return self._is_root_like
@ -174,6 +188,7 @@ class GeminiResource():
indexable_url = property(_get_indexable_url)
is_root_like = property(_get_is_root_like)
is_log_like = property(_get_is_log_like)
normalized_host_like = property(_get_normalized_host_like)
def fetch(self):
# NB: this intentionally does NOT fetch the normalized URL, because that could

24
scripts/add_is_cross_host_like.py

@ -0,0 +1,24 @@
from gus import constants
from gus.lib.db_model import init_db, Link, Page
from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
def main():
    """Backfill the new Link.is_cross_host_like column for every existing row.

    Re-derives each link's from/to GeminiResource from the stored
    fetchable_url and recomputes is_cross_host_like, saving row by row and
    logging progress to stdout.
    """
    db = init_db(f"index.new/{constants.DB_FILENAME}")
    # Two aliases of Page so the query can join it twice, once per link end.
    PageFrom = Page.alias()
    PageTo = Page.alias()
    link_query = (Link
                  .select(Link, PageFrom, PageTo)
                  .join(PageFrom, on=(Link.from_page_id == PageFrom.id))
                  .join(PageTo, on=(Link.to_page_id == PageTo.id)))
    # Run the whole backfill in one transaction: without this, each save()
    # gets its own implicit transaction, which is dramatically slower on
    # SQLite. NOTE(review): assumes init_db returns the peewee Database
    # handle -- confirm against gus.lib.db_model.init_db.
    with db.atomic():
        for link in link_query.iterator():
            from_resource = GeminiResource(link.from_page.fetchable_url)
            to_resource = GeminiResource(link.to_page.fetchable_url)
            is_cross_host_like = Link.get_is_cross_host_like(from_resource, to_resource)
            link.is_cross_host_like = is_cross_host_like
            link.save()
            print("[{}] {} -> {}".format("T" if is_cross_host_like else "F", from_resource.fetchable_url, to_resource.fetchable_url))
    print("\nDone!")


if __name__ == "__main__":
    main()
Loading…
Cancel
Save