
Reformat code with Black

remotes/src/master
Natalie Pendragon committed 11 months ago
commit 43397bdda3
14 changed files:

  1. gus/__init__.py (2 lines changed)
  2. gus/build_index.py (104 lines changed)
  3. gus/crawl.py (346 lines changed)
  4. gus/lib/db_model.py (25 lines changed)
  5. gus/lib/domain.py (1 line changed)
  6. gus/lib/gemini.py (151 lines changed)
  7. gus/lib/logging.py (15 lines changed)
  8. gus/lib/misc.py (25 lines changed)
  9. gus/lib/whoosh_extensions.py (23 lines changed)
  10. serve/constants.py (6 lines changed)
  11. serve/main.py (20 lines changed)
  12. serve/models.py (156 lines changed)
  13. serve/views.py (186 lines changed)
  14. tests/gus/lib/test_gemini.py (29 lines changed)

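This commit is a mechanical formatting pass: Black normalizes string quotes to double quotes, re-wraps call arguments against its line-length target, and adds trailing commas, without changing runtime behavior. As an illustration only (not part of the commit), here is a minimal sketch that reproduces the gus/__init__.py change with Black's programmatic API; it assumes Black is installed and uses its format_str and FileMode entry points.

# Minimal sketch, not part of this commit: apply Black to the one-line
# module from gus/__init__.py. Assumes `pip install black`.
import black

before = "__version__ = '0.1.0'\n"
# format_str() runs the same formatter the `black` CLI applies to file contents.
after = black.format_str(before, mode=black.FileMode())
print(after)  # __version__ = "0.1.0"

The same pass is what collapses the multi-line keyword arguments in the Schema(...) call in gus/build_index.py and splits the long logging calls in gus/crawl.py across lines.
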
gus/__init__.py (2 lines changed)

@@ -1 +1 @@
__version__ = '0.1.0'
__version__ = "0.1.0"

gus/build_index.py (104 lines changed)

@@ -13,7 +13,11 @@ from whoosh.index import open_dir
from gus.crawl import EXCLUDED_URL_PREFIXES
from gus.lib.db_model import init_db, Page
from gus.lib.index_statistics import compute_index_statistics, persist_statistics, log_index_statistics
from gus.lib.index_statistics import (
compute_index_statistics,
persist_statistics,
log_index_statistics,
)
from gus.lib.whoosh_extensions import UrlAnalyzer
import gus.lib.logging
@@ -39,42 +43,16 @@ def create_index(index_dir):
# shutil.rmtree(index_dir, ignore_errors=True)
pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
schema = Schema(
url_id=ID(
unique=True,
),
url=TEXT(
field_boost=2.0,
stored=True,
analyzer=UrlAnalyzer(),
),
url_id=ID(unique=True,),
url=TEXT(field_boost=2.0, stored=True, analyzer=UrlAnalyzer(),),
fetchable_url=STORED(),
domain=TEXT(
analyzer=UrlAnalyzer(),
),
port=NUMERIC(
int,
32,
signed=False,
stored=True,
),
content_type=TEXT(
stored=True,
),
charset=ID(
stored=True,
),
lang=ID(
stored=True,
),
content=TEXT(
analyzer=FancyAnalyzer(),
spelling=True,
stored=True,
),
prompt=TEXT(
analyzer=FancyAnalyzer(),
stored=True,
),
domain=TEXT(analyzer=UrlAnalyzer(),),
port=NUMERIC(int, 32, signed=False, stored=True,),
content_type=TEXT(stored=True,),
charset=ID(stored=True,),
lang=ID(stored=True,),
content=TEXT(analyzer=FancyAnalyzer(), spelling=True, stored=True,),
prompt=TEXT(analyzer=FancyAnalyzer(), stored=True,),
size=NUMERIC(
int,
# this means GUS will have problems indexing responses over ~2GB
@@ -83,14 +61,9 @@ def create_index(index_dir):
stored=True,
),
backlink_count=NUMERIC(
int,
16, # num bits, so max value is 65k
signed=False,
stored=True,
),
indexed_at=DATETIME(
stored=True,
int, 16, signed=False, stored=True, # num bits, so max value is 65k
),
indexed_at=DATETIME(stored=True,),
)
index_storage.create_index(schema)
@@ -102,16 +75,23 @@ def index_page(page, indexed_urls):
should_skip = True
break
if should_skip:
logging.debug('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
logging.debug(
"URL prefix matches exclusion list, skipping: %s",
gus.lib.logging.strip_control_chars(page.url),
)
return False
if page.fetchable_url in indexed_urls:
logging.debug('Page already indexed, skipping: %s', gus.lib.logging.strip_control_chars(page.url))
logging.debug(
"Page already indexed, skipping: %s",
gus.lib.logging.strip_control_chars(page.url),
)
return False
logging.info("Indexing page: %s", gus.lib.logging.strip_control_chars(page.url))
u = page.url.rstrip("/")
external_backlinks = Page.raw("""SELECT p_from.url
external_backlinks = Page.raw(
"""SELECT p_from.url
FROM page AS p_from
JOIN indexable_crawl AS ic
ON ic.page_id == p_from.id
@@ -121,7 +101,10 @@ JOIN page as p_to
ON p_to.id == l.to_page_id
WHERE p_to.url IN (?, ?)
AND l.is_cross_host_like == 1
GROUP BY p_from.normalized_url""", u, f"{u}/")
GROUP BY p_from.normalized_url""",
u,
f"{u}/",
)
backlink_urls = [b.url for b in external_backlinks.execute()]
backlink_count = len(backlink_urls)
@@ -146,9 +129,12 @@ GROUP BY p_from.normalized_url""", u, f"{u}/")
index_writer.add_document(**document)
return True
except:
logging.warn("Failed to index page: %s", gus.lib.logging.strip_control_chars(page.url))
logging.warn(
"Failed to index page: %s", gus.lib.logging.strip_control_chars(page.url)
)
return False
def load_indexed_urls(index_dir):
indexed_urls = []
ix = open_dir(index_dir)
@@ -162,7 +148,9 @@ def load_indexed_urls(index_dir):
def invalidate_recent_results(invalidation_window):
recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
pages = Page.select().where(Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum)
pages = Page.select().where(
Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum
)
for page in pages:
index_writer.delete_by_term("url_id", page.url, searcher=None)
@@ -183,13 +171,17 @@ def build_index(should_run_destructive=False, invalidation_window=0):
index_writer = ix.writer()
invalidate_recent_results(invalidation_window)
indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
indexed_urls = (
[] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
)
pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp
pages = Page.raw(
"""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp
FROM indexable_crawl AS c
JOIN page AS p
ON p.id == c.page_id
GROUP BY p.normalized_url""")
GROUP BY p.normalized_url"""
)
i = 0
for page in pages.iterator():
@@ -202,10 +194,10 @@ GROUP BY p.normalized_url""")
# it to flush segments to disk every 5000 documents, which
# should scale well with Geminispace going forward.
if i % 5000 == 0:
logging.debug('Committing index.')
logging.debug("Committing index.")
index_writer.commit()
index_writer = ix.writer()
logging.debug('Committing index for the last time.')
logging.debug("Committing index for the last time.")
index_writer.commit()
index_statistics = compute_index_statistics(db)
@@ -216,7 +208,7 @@ GROUP BY p.normalized_url""")
# shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True)
# shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT)
logging.info('Finished!')
logging.info("Finished!")
def main():
@@ -226,7 +218,7 @@ def main():
def parse_args():
parser = argparse.ArgumentParser(description='Crawl Geminispace.')
parser = argparse.ArgumentParser(description="Crawl Geminispace.")
parser.add_argument(
"--destructive",
"-d",

gus/crawl.py (346 lines changed)

@@ -155,7 +155,10 @@ CRAWL_DELAYS = {
def index_binary(resource, response):
logging.debug('Indexing binary for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
logging.debug(
"Indexing binary for: %s",
gus.lib.logging.strip_control_chars(resource.indexable_url),
)
doc = {
"url": resource.indexable_url,
@@ -171,15 +174,23 @@ def index_binary(resource, response):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("binary")
doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "binary")
existing_change_frequency = (
existing_page.change_frequency
or resource.get_default_change_frequency("binary")
)
doc["change_frequency"] = resource.increment_change_frequency(
existing_change_frequency, "binary"
)
page = Page(**doc)
page.save()
return page
def index_redirect(resource):
logging.debug('Indexing redirect for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
logging.debug(
"Indexing redirect for: %s",
gus.lib.logging.strip_control_chars(resource.indexable_url),
)
doc = {
"url": resource.indexable_url,
@@ -192,15 +203,23 @@ def index_redirect(resource):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("redirect")
doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "redirect")
existing_change_frequency = (
existing_page.change_frequency
or resource.get_default_change_frequency("redirect")
)
doc["change_frequency"] = resource.increment_change_frequency(
existing_change_frequency, "redirect"
)
page = Page(**doc)
page.save()
return page
def index_error(resource, is_temporary):
logging.debug('Indexing error for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
logging.debug(
"Indexing error for: %s",
gus.lib.logging.strip_control_chars(resource.indexable_url),
)
category = "temp_error" if is_temporary else "perm_error"
default_change_frequency = resource.get_default_change_frequency(category)
@@ -215,16 +234,22 @@ def index_error(resource, is_temporary):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
existing_change_frequency = existing_page.change_frequency or default_change_frequency
doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, category)
existing_change_frequency = (
existing_page.change_frequency or default_change_frequency
)
doc["change_frequency"] = resource.increment_change_frequency(
existing_change_frequency, category
)
page = Page(**doc)
page.save()
return page
def index_prompt(resource, response):
logging.debug('Indexing prompt for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
logging.debug(
"Indexing prompt for: %s",
gus.lib.logging.strip_control_chars(resource.indexable_url),
)
doc = {
"url": resource.indexable_url,
@@ -241,15 +266,23 @@ def index_prompt(resource, response):
existing_page = Page.get_or_none(url=resource.indexable_url)
if existing_page:
doc["id"] = existing_page.id
existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("prompt")
doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "prompt")
existing_change_frequency = (
existing_page.change_frequency
or resource.get_default_change_frequency("prompt")
)
doc["change_frequency"] = resource.increment_change_frequency(
existing_change_frequency, "prompt"
)
page = Page(**doc)
page.save()
return page
def index_content(resource, response):
logging.debug('Indexing content for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url))
logging.debug(
"Indexing content for: %s",
gus.lib.logging.strip_control_chars(resource.indexable_url),
)
doc = {
"url": resource.indexable_url,
@@ -264,7 +297,7 @@ def index_content(resource, response):
"change_frequency": resource.get_default_change_frequency("content"),
}
if response.content_type == "text/gemini":
doc["lang"] = response.lang or "none",
doc["lang"] = (response.lang or "none",)
existing_page = Page.get_or_none(url=resource.indexable_url)
is_different = False
if existing_page:
@@ -272,10 +305,17 @@ def index_content(resource, response):
if existing_page.content:
is_different = doc["content"] != existing_page.content
if is_different:
doc["change_frequency"] = resource.get_default_change_frequency("content")
doc["change_frequency"] = resource.get_default_change_frequency(
"content"
)
else:
existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("content")
doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "content")
existing_change_frequency = (
existing_page.change_frequency
or resource.get_default_change_frequency("content")
)
doc["change_frequency"] = resource.increment_change_frequency(
existing_change_frequency, "content"
)
page = Page(**doc)
page.save()
return page, is_different
@@ -301,17 +341,21 @@ def index_links(from_resource, contained_resources):
domain=cr.normalized_host,
port=cr.urlsplit.port or 1965,
)
data.append({
"from_page": from_page,
"to_page": to_page,
"is_cross_host_like": Link.get_is_cross_host_like(from_resource, cr),
})
data.append(
{
"from_page": from_page,
"to_page": to_page,
"is_cross_host_like": Link.get_is_cross_host_like(from_resource, cr),
}
)
Link.insert_many(data).execute()
def fetch_robots_file(robot_host):
robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt")
logging.info('Fetching robots file: %s', gus.lib.logging.strip_control_chars(robot_url))
logging.info(
"Fetching robots file: %s", gus.lib.logging.strip_control_chars(robot_url)
)
rp = GeminiRobotFileParser(robot_url)
rp.read()
@@ -322,30 +366,52 @@ def get_robots_file(robot_host):
return robot_file_map[robot_host]
def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, redirect_chain=[]):
def crawl_page(
gemini_resource, current_depth, should_check_if_expired=True, redirect_chain=[]
):
gr = gemini_resource
url = gr.fetchable_url
if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
logging.warn('Going too deep, skipping: %s', gus.lib.logging.strip_control_chars(url))
logging.warn(
"Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url)
)
return
if not gemini_resource.is_valid:
logging.warn('Not a valid gemini resource, skipping: %s', gus.lib.logging.strip_control_chars(url))
logging.warn(
"Not a valid gemini resource, skipping: %s",
gus.lib.logging.strip_control_chars(url),
)
return
for excluded_prefix in EXCLUDED_URL_PREFIXES:
if gr.normalized_url.startswith(excluded_prefix):
logging.info('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url))
logging.info(
"URL prefix matches exclusion list, skipping: %s",
gus.lib.logging.strip_control_chars(url),
)
return
for excluded_path in EXCLUDED_URL_PATHS:
if gr.urlsplit.path.lower().endswith(excluded_path):
logging.info('URL on exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url))
logging.info(
"URL on exclusion list, skipping: %s",
gus.lib.logging.strip_control_chars(url),
)
return
if should_check_if_expired:
existing_page = Page.get_or_none(url=gr.indexable_url)
if existing_page and existing_page.change_frequency is not None:
most_recent_crawl = Crawl.select(peewee.fn.MAX(Crawl.timestamp)).where(Crawl.page == existing_page).scalar()
if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(hours=existing_page.change_frequency):
logging.debug('Recrawling too soon, skipping: %s', gus.lib.logging.strip_control_chars(gr.fetchable_url))
most_recent_crawl = (
Crawl.select(peewee.fn.MAX(Crawl.timestamp))
.where(Crawl.page == existing_page)
.scalar()
)
if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(
hours=existing_page.change_frequency
):
logging.debug(
"Recrawling too soon, skipping: %s",
gus.lib.logging.strip_control_chars(gr.fetchable_url),
)
return
# ROBOTS
@@ -365,120 +431,188 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red
crawl_delay = robots_file.crawl_delay("gus")
if not can_fetch:
logging.debug('Blocked by robots files, skipping: %s', gus.lib.logging.strip_control_chars(url))
logging.debug(
"Blocked by robots files, skipping: %s",
gus.lib.logging.strip_control_chars(url),
)
return
# Crawl delay
if gr.normalized_host in domain_hit_timings:
if gr.normalized_host in CRAWL_DELAYS:
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=CRAWL_DELAYS[gr.normalized_host])
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
milliseconds=CRAWL_DELAYS[gr.normalized_host]
)
elif not crawl_delay:
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=500)
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
milliseconds=500
)
else:
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=crawl_delay)
next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
milliseconds=crawl_delay
)
sleep_duration = max((next_allowed_hit - datetime.now()).total_seconds(), 0)
time.sleep(sleep_duration)
domain_hit_timings[gr.normalized_host] = datetime.now()
# Actually fetch!
logging.info('Fetching resource: %s', gus.lib.logging.strip_control_chars(url))
logging.info("Fetching resource: %s", gus.lib.logging.strip_control_chars(url))
if gr.fully_qualified_parent_url is not None:
logging.debug('with parent: %s', gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url))
logging.debug(
"with parent: %s",
gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url),
)
response = gr.fetch()
if response is None:
# problem before getting a response
logging.warn('Failed to fetch: %s', gus.lib.logging.strip_control_chars(url))
logging.warn("Failed to fetch: %s", gus.lib.logging.strip_control_chars(url))
page = index_error(gr, True)
page_crawl = Crawl(page=page,
status=0,
is_different=False,
timestamp=datetime.utcnow())
page_crawl = Crawl(
page=page, status=0, is_different=False, timestamp=datetime.utcnow()
)
page_crawl.save()
elif response.status.startswith("4"):
# temporary error status
logging.debug('Got temporary error: %s: %s %s',
gus.lib.logging.strip_control_chars(url),
response.status,
response.error_message)
logging.debug(
"Got temporary error: %s: %s %s",
gus.lib.logging.strip_control_chars(url),
response.status,
response.error_message,
)
page = index_error(gr, True)
page_crawl = Crawl(page=page,
status=response.status,
is_different=False,
error_message=response.error_message,
timestamp=datetime.utcnow())
page_crawl = Crawl(
page=page,
status=response.status,
is_different=False,
error_message=response.error_message,
timestamp=datetime.utcnow(),
)
page_crawl.save()
elif response.status.startswith("5"):
# permanent error status
logging.debug('Got permanent error: %s: %s %s',
gus.lib.logging.strip_control_chars(url),
response.status,
response.error_message)
logging.debug(
"Got permanent error: %s: %s %s",
gus.lib.logging.strip_control_chars(url),
response.status,
response.error_message,
)
page = index_error(gr, False)
page_crawl = Crawl(page=page,
status=response.status,
is_different=False,
error_message=response.error_message,
timestamp=datetime.utcnow())
page_crawl = Crawl(
page=page,
status=response.status,
is_different=False,
error_message=response.error_message,
timestamp=datetime.utcnow(),
)
page_crawl.save()
elif response.status.startswith("3"):
# redirect status
logging.debug('Got redirected: %s: %s %s',
gus.lib.logging.strip_control_chars(url),
response.status,
response.url)
logging.debug(
"Got redirected: %s: %s %s",
gus.lib.logging.strip_control_chars(url),
response.status,
response.url,
)
if len(redirect_chain) > constants.MAXIMUM_REDIRECT_CHAIN_LENGTH:
logging.info('Aborting, maximum redirect chain length reached: %s', gus.lib.logging.strip_control_chars(url))
logging.info(
"Aborting, maximum redirect chain length reached: %s",
gus.lib.logging.strip_control_chars(url),
)
return
redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
redirect_resource = GeminiResource(
response.url, gr.normalized_url, gr.normalized_host
)
if redirect_resource.fetchable_url == gr.fetchable_url:
logging.info('Aborting, redirecting to self: %s', gus.lib.logging.strip_control_chars(url))
logging.info(
"Aborting, redirecting to self: %s",
gus.lib.logging.strip_control_chars(url),
)
return
page = index_redirect(gr)
page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
page_crawl = Crawl(
page=page,
status=response.status,
is_different=False,
timestamp=datetime.utcnow(),
)
page_crawl.save()
index_links(gr, [redirect_resource])
crawl_page(redirect_resource, current_depth, should_check_if_expired=True, redirect_chain=redirect_chain + [gr.fetchable_url])
crawl_page(
redirect_resource,
current_depth,
should_check_if_expired=True,
redirect_chain=redirect_chain + [gr.fetchable_url],
)
elif response.status.startswith("1"):
# input status
logging.debug('Input requested at: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.prompt)
logging.debug(
"Input requested at: %s: %s %s",
gus.lib.logging.strip_control_chars(url),
response.status,
response.prompt,
)
page = index_prompt(gr, response)
page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
page_crawl = Crawl(
page=page,
status=response.status,
is_different=False,
timestamp=datetime.utcnow(),
)
page_crawl.save()
elif response.status.startswith("2"):
# success status
logging.debug('Successful request: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.content_type)
logging.debug(
"Successful request: %s: %s %s",
gus.lib.logging.strip_control_chars(url),
response.status,
response.content_type,
)
if response.content_type.startswith("text/"):
page, is_different = index_content(gr, response)
page_crawl = Crawl(
page=page,
status=response.status,
is_different=is_different,
timestamp=datetime.utcnow()
timestamp=datetime.utcnow(),
)
page_crawl.save()
if response.content_type != "text/gemini":
logging.debug('Content is not gemini text: %s: %s',
gus.lib.logging.strip_control_chars(url), response.content_type)
logging.debug(
"Content is not gemini text: %s: %s",
gus.lib.logging.strip_control_chars(url),
response.content_type,
)
else:
logging.debug('Got gemini text, extracting and crawling links: %s',
gus.lib.logging.strip_control_chars(url))
logging.debug(
"Got gemini text, extracting and crawling links: %s",
gus.lib.logging.strip_control_chars(url),
)
contained_resources = gr.extract_contained_resources(response.content)
index_links(gr, contained_resources)
for resource in contained_resources:
crawl_page(resource, current_depth+1, should_check_if_expired=True)
crawl_page(
resource, current_depth + 1, should_check_if_expired=True
)
else:
page = index_binary(gr, response)
page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
page_crawl = Crawl(
page=page,
status=response.status,
is_different=False,
timestamp=datetime.utcnow(),
)
page_crawl.save()
else:
logging.warn('Got unhandled status: %s: %s',
gus.lib.logging.strip_control_chars(url),
response.status)
logging.warn(
"Got unhandled status: %s: %s",
gus.lib.logging.strip_control_chars(url),
response.status,
)
def pickle_robot_file_map(robot_file_map, index_dir):
@@ -487,13 +621,14 @@ def pickle_robot_file_map(robot_file_map, index_dir):
def unpickle_robot_file_map(index_dir):
if not os.path.isfile(index_dir + "/robot_file_map.p"):
logging.debug('Robot file cache missing')
logging.debug("Robot file cache missing")
return {}
return pickle.load(open(index_dir + "/robot_file_map.p", "rb"))
def load_expired_urls():
expired_pages = Page.raw("""SELECT url
expired_pages = Page.raw(
"""SELECT url
FROM (
SELECT p.url, p.normalized_url, p.change_frequency, MAX(c.timestamp) as timestamp
FROM page as p
@@ -502,7 +637,8 @@ FROM (
GROUP BY p.url
)
WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now')
GROUP BY normalized_url;""")
GROUP BY normalized_url;"""
)
return [page.url for page in expired_pages.execute()]
@@ -528,7 +664,10 @@ def load_feed_urls(filename):
def items_from_feed_string(feed_str):
feed_obj = feedparser.parse(feed_str)
feed = feed_obj.feed
return [(entry.updated_parsed, entry.link, entry.title, feed.title) for entry in feed_obj.entries]
return [
(entry.updated_parsed, entry.link, entry.title, feed.title)
for entry in feed_obj.entries
]
def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
@@ -550,26 +689,29 @@ def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
now = time.time()
interval = int(now - last)
if interval < 5:
logging.warn('Declining to hit %s again after only %d seconds',
gus.lib.logging.strip_control_chars(feed_resource.normalized_host),
interval)
logging.warn(
"Declining to hit %s again after only %d seconds",
gus.lib.logging.strip_control_chars(feed_resource.normalized_host),
interval,
)
feed_urls.insert(0, feed_url)
skips += 1
if skips == len(feed_urls):
# We've hammered every server in the queue! Sleep a bit...
logging.warn('Sleeping to give all servers a rest!')
logging.warn("Sleeping to give all servers a rest!")
time.sleep(5)
continue
skips = 0
# Good to go
logging.info('Fetching feed: %s',
gus.lib.logging.strip_control_chars(feed_url))
logging.info("Fetching feed: %s", gus.lib.logging.strip_control_chars(feed_url))
try:
resp = feed_resource.fetch()
except:
logging.info('Error fetching feed, skipping: %s',
gus.lib.logging.strip_control_chars(feed_url))
logging.info(
"Error fetching feed, skipping: %s",
gus.lib.logging.strip_control_chars(feed_url),
)
continue
if resp and resp.status == "20":
last_accessed[feed_resource.normalized_host] = time.time()
@@ -595,8 +737,10 @@ def recrawl_feeds():
crawl_page(resource, 0)
pickle_robot_file_map(robot_file_map, index_dir)
logging.debug('Recrawled feeds: %s', gus.lib.logging.strip_control_chars(content_urls))
logging.info('Finished!')
logging.debug(
"Recrawled feeds: %s", gus.lib.logging.strip_control_chars(content_urls)
)
logging.info("Finished!")
def run_crawl(should_run_destructive=False, seed_urls=[]):
@@ -609,7 +753,9 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
global robot_file_map
robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
robot_file_map = (
{} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
)
global domain_hit_timings
domain_hit_timings = {}
global max_crawl_depth
@@ -623,7 +769,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
crawl_page(resource, 0, should_check_if_expired=True)
pickle_robot_file_map(robot_file_map, index_dir)
logging.info('Finished!')
logging.info("Finished!")
def main():
@@ -637,7 +783,7 @@ def main():
def parse_args():
parser = argparse.ArgumentParser(description='Crawl Geminispace.')
parser = argparse.ArgumentParser(description="Crawl Geminispace.")
parser.add_argument(
"--destructive",
"-d",

gus/lib/db_model.py (25 lines changed)

@@ -12,6 +12,7 @@ from peewee import (
from gus.lib.gemini import GeminiResource
def init_db(filename=":memory:"):
"""
Bind an SQLite database to the Peewee ORM models.
@@ -20,13 +21,15 @@ def init_db(filename=":memory:"):
db = SqliteDatabase(filename)
db.bind(models)
db.create_tables(models)
db.execute_sql("""CREATE VIEW IF NOT EXISTS indexable_crawl AS
db.execute_sql(
"""CREATE VIEW IF NOT EXISTS indexable_crawl AS
SELECT c.* FROM (
SELECT crawl.*, row_number()
OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS rank
FROM crawl) AS c
WHERE c.rank < 3
AND c.status == 20;""")
AND c.status == 20;"""
)
return db
@@ -46,33 +49,36 @@ class Page(Model):
lang = TextField(null=True)
content = TextField(null=True)
prompt = TextField(null=True)
size = IntegerField(null=True) # in bytes
change_frequency = IntegerField(null=True) # in hours
size = IntegerField(null=True) # in bytes
change_frequency = IntegerField(null=True) # in hours
indexed_at = DateTimeField(null=True)
class Link(Model):
"""
Hyperlinks between pages in Geminispace
"""
from_page = ForeignKeyField(Page, backref="outbound_links", on_delete='CASCADE')
to_page = ForeignKeyField(Page, backref="backlinks", on_delete='CASCADE')
from_page = ForeignKeyField(Page, backref="outbound_links", on_delete="CASCADE")
to_page = ForeignKeyField(Page, backref="backlinks", on_delete="CASCADE")
is_cross_host_like = BooleanField()
def get_is_cross_host_like(from_resource, to_resource):
return from_resource.normalized_host_like != to_resource.normalized_host_like
class Crawl(Model):
"""
Attempts to crawl a page.
"""
page = ForeignKeyField(Page, backref="crawls", on_delete='CASCADE')
page = ForeignKeyField(Page, backref="crawls", on_delete="CASCADE")
status = IntegerField()
error_message = TextField(null=True)
is_different = BooleanField()
timestamp = DateTimeField()
class Search(Model):
"""
A log of performed searches
@@ -81,19 +87,22 @@ class Search(Model):
query = TextField()
timestamp = DateTimeField()
class Thread(Model):
"""
Thread definitions.
"""
updated_at = DateTimeField()
class ThreadPage(Model):
"""
Mapping table of threads to their member pages.
"""
thread = ForeignKeyField(Thread, backref="pages", on_delete="CASCADE")
page = ForeignKeyField(Page, backref="threads", on_delete='CASCADE')
page = ForeignKeyField(Page, backref="threads", on_delete="CASCADE")
address = TextField()
friendly_author = TextField()
friendly_title = TextField()

gus/lib/domain.py (1 line changed)

File diff suppressed because one or more lines are too long

gus/lib/gemini.py (151 lines changed)

@@ -1,5 +1,12 @@
import re
from urllib.parse import unquote, urljoin, urlsplit, urlunsplit, uses_relative, uses_netloc
from urllib.parse import (
unquote,
urljoin,
urlsplit,
urlunsplit,
uses_relative,
uses_netloc,
)
from urllib.robotparser import RobotFileParser
import gusmobile
@@ -12,22 +19,47 @@ from gus.lib.domain import is_domain
uses_relative.append("gemini")
uses_netloc.append("gemini")
LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE)
LOG_POST_LIKE_PATTERN = re.compile(".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE)
LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss|handlers|diagnostics)/.*|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$", flags=re.IGNORECASE)
LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile("^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE)
LOG_POST_BOSTON_LIKE_PATTERN = re.compile("^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE)
ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE)
ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE)
LOG_ROOT_LIKE_PATTERN = re.compile(
".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$",
flags=re.IGNORECASE,
)
LOG_POST_LIKE_PATTERN = re.compile(
".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)",
flags=re.IGNORECASE,
)
LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(
".*/(games|archive|archives|rss|handlers|diagnostics)/.*|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$",
flags=re.IGNORECASE,
)
LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile(
"^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE
)
LOG_POST_BOSTON_LIKE_PATTERN = re.compile(
"^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE
)
ROOT_LIKE_ONLY_PATTERN = re.compile(
"^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE
)
ROOT_LIKE_PATTERN = re.compile(
"^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE
)
PIKKULOG_LIKE_PATTERN = re.compile(".*/pikkulog/.*", flags=re.IGNORECASE)
AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE)
AUTHOR_CONTENT_PATTERN = re.compile(".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE)
AUTHOR_URL_PATTERN = re.compile(
"^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE
)
AUTHOR_CONTENT_PATTERN = re.compile(
".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE
)
TITLE_CONTENT_PATTERN = re.compile("^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE)
TITLE_URL_PATTERN = re.compile(".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$", flags=re.IGNORECASE)
TITLE_URL_PATTERN = re.compile(
".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$",
flags=re.IGNORECASE,
)
class GeminiRobotFileParser(RobotFileParser):
def set_url(self, url):
@@ -36,7 +68,6 @@ class GeminiRobotFileParser(RobotFileParser):
u, _ = GeminiResource.urlsplit_featureful(url)
self.host, self.path = u[1:3]
def read(self):
"""Reads the robots.txt URL and feeds it to the parser."""
gr = GeminiResource(self.url)
@@ -50,7 +81,7 @@ class GeminiRobotFileParser(RobotFileParser):
self.parse(response.content.splitlines())
class GeminiResource():
class GeminiResource:
def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None):
self.raw_url = url
self.urlsplit, self.is_relative = GeminiResource.urlsplit_featureful(
@@ -80,7 +111,7 @@ class GeminiResource():
# things behind the scenes.
is_relative = False
u = urlsplit(url, 'gemini')
u = urlsplit(url, "gemini")
if u.scheme != "gemini":
return None, None
if u.hostname is None:
@@ -89,9 +120,9 @@ class GeminiResource():
if parent_hostname is None:
return None, None
joined = urljoin("gemini://{}".format(parent_hostname), url)
u = urlsplit(joined, 'gemini')
u = urlsplit(joined, "gemini")
is_relative = True
else: # url does not start with /
else: # url does not start with /
# could be: blah.com/test
# could be: test
url_split = url.split("/")
@@ -99,33 +130,36 @@ class GeminiResource():
# prepend with "gemini://" so built-in urlsplit will extract
# the host properly, and continue on
url = "gemini://{}".format(url)
u = urlsplit(url, 'gemini')
u = urlsplit(url, "gemini")
else:
# process relative link
if fully_qualified_parent_url is None:
return None, None
joined = urljoin(fully_qualified_parent_url, url)
u = urlsplit(joined, 'gemini')
u = urlsplit(joined, "gemini")
is_relative = True
return u, is_relative
def _get_normalized_url(self):
if not self.is_valid:
return None
if self._normalized_url is None:
self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
(
self._normalized_url,
self._normalized_host,
) = self._get_normalized_url_and_host()
return self._normalized_url
def _get_normalized_host(self):
if not self.is_valid:
return None
if self._normalized_host is None:
self._normalized_url, self._normalized_host = self._get_normalized_url_and_host()
(
self._normalized_url,
self._normalized_host,
) = self._get_normalized_url_and_host()
return self._normalized_host
def _get_normalized_host_like(self):
if not self.is_valid:
return None
@@ -137,7 +171,6 @@ class GeminiResource():
self._normalized_host_like = normalized_host_like
return self._normalized_host_like
def _get_fetchable_url(self):
if not self.is_valid:
return None
@@ -162,27 +195,32 @@ class GeminiResource():
self._fetchable_url = url
return self._fetchable_url
def _get_indexable_url(self):
if not self.is_valid:
return None
if self._indexable_url is None:
indexable_url = unquote(self.fetchable_url)
if self.urlsplit.port == 1965:
indexable_url = self.normalized_url.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
indexable_url = self.normalized_url.replace(
self.urlsplit.hostname.lower() + ":1965",
self.urlsplit.hostname.lower(),
1,
)
self._indexable_url = indexable_url
return self._indexable_url
def _get_is_root_like(self):
if self._is_root_like is None:
is_root_like = False
if self.urlsplit.path == "" or self.urlsplit.path == "/" or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path):
if (
self.urlsplit.path == ""
or self.urlsplit.path == "/"
or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path)
):
is_root_like = True
self._is_root_like = is_root_like
return self._is_root_like
def _get_is_pikkulog_like(self):
if self._is_pikkulog_like is None:
is_pikkulog_like = False
@@ -192,30 +230,39 @@ class GeminiResource():
self._is_pikkulog_like = is_pikkulog_like
return self._is_pikkulog_like
def _get_is_log_root_like(self):
if self._is_log_root_like is None:
is_log_root_like = False
if self.urlsplit.path == "" or self.urlsplit.path == "/" or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path):
if (
self.urlsplit.path == ""
or self.urlsplit.path == "/"
or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path)
):
is_log_root_like = True
self._is_log_root_like = is_log_root_like
return self._is_log_root_like
def _get_is_log_post_like(self):
if self._is_log_post_like is None:
is_log_post_like = False
post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path)
post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path)
post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path)
post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(
self.urlsplit.path
)
post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(
self.urlsplit.path
)
post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path)
if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match) or (self.normalized_host == "gemini.conman.org" and post_boston_match):
if (
(post_like_match and not post_like_exclusion_match)
or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match)
or (self.normalized_host == "gemini.conman.org" and post_boston_match)
):
is_log_post_like = True
self._is_log_post_like = is_log_post_like
return self._is_log_post_like
def get_friendly_author(self, content):
if not self.is_valid:
return None
@@ -238,7 +285,6 @@ class GeminiResource():
friendly_author = self.normalized_host
return friendly_author
def get_friendly_title(self, content):
if not self.is_valid:
return None
@@ -253,13 +299,18 @@ class GeminiResource():
# if no content match, try looking in URL
title_url_match = TITLE_URL_PATTERN.match(self.urlsplit.path)
if title_url_match:
friendly_title = title_url_match[2].replace("-", " ").replace("_", " ").strip().title()
friendly_title = (
title_url_match[2]
.replace("-", " ")
.replace("_", " ")
.strip()
.title()
)
if friendly_title is None:
# if still no match, use URL path
friendly_title = self.urlsplit.path.lstrip("/")
return friendly_title
def get_default_change_frequency(self, category):
if not self.is_valid:
return None
@@ -287,7 +338,6 @@ class GeminiResource():
self._default_change_frequency = change_frequency
return self._default_change_frequency
def increment_change_frequency(self, existing_change_frequency, category):
if category == "content":
if self.is_root_like or self.is_log_root_like:
@@ -309,7 +359,6 @@ class GeminiResource():
else:
raise Exception.NameError("Unrecognized resource category")
# constructed from fetchable_url
# does not matter if quoted or unquoted so I choose arbitrarily to
# standardize on unquoting it.
@@ -333,15 +382,17 @@ class GeminiResource():
# and a server redirecting to the same URL _with_ a trailing slash.
return gusmobile.fetch(self.fetchable_url)
def _get_normalized_url_and_host(self):
url_normalized = unquote(self.fetchable_url.lower().rstrip("/"))
if self.urlsplit.port == 1965:
url_normalized = url_normalized.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
url_normalized = url_normalized.replace(
self.urlsplit.hostname.lower() + ":1965",
self.urlsplit.hostname.lower(),
1,
)
host_normalized = self.urlsplit.hostname.lower()
return url_normalized, host_normalized
def extract_contained_resources(self, content):
# this finds all gemini URLs within the content of a given GeminiResource and
# returns them as a list of new GeminiResources
@@ -349,9 +400,13 @@ class GeminiResource():
return self.contained_resources
link_pattern = "^=>\s*(\S+)"
preformat_pattern = r'^```.*?^```'
content_without_preformat = re.sub(preformat_pattern, '', content, flags=re.DOTALL | re.MULTILINE)
probable_urls = re.findall(link_pattern, content_without_preformat, re.MULTILINE)
preformat_pattern = r"^```.*?^```"
content_without_preformat = re.sub(
preformat_pattern, "", content, flags=re.DOTALL | re.MULTILINE
)
probable_urls = re.findall(
link_pattern, content_without_preformat, re.MULTILINE
)
resources = []
for url in probable_urls:
resource = GeminiResource(

gus/lib/logging.py (15 lines changed)

@@ -7,11 +7,11 @@ def add_arguments(parser):
"""Add arguments to the given argument argparse parser."""
parser.add_argument(
'--logging-config',
'-c',
dest='logging_ini_fname',
"--logging-config",
"-c",
dest="logging_ini_fname",
default=False,
help='Location of logging configuration file'
help="Location of logging configuration file",
)
@@ -22,11 +22,10 @@ def handle_arguments(args):
if os.path.isfile(args.logging_ini_fname):
logging.config.fileConfig(args.logging_ini_fname)
else:
sys.exit('Can not find logging ini file: %s' %
args.logging_ini_fname)
sys.exit("Can not find logging ini file: %s" % args.logging_ini_fname)
elif os.path.isfile('logging.ini'):
logging.config.fileConfig('logging.ini')
elif os.path.isfile("logging.ini"):
logging.config.fileConfig("logging.ini")
def strip_control_chars(s):

gus/lib/misc.py (25 lines changed)

@@ -8,15 +8,24 @@ License: MIT
"""
SYMBOLS = {
'customary' : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
'zetta', 'iotta'),
'iec' : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
'iec_ext' : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
'zebi', 'yobi'),
"customary": ("B", "K", "M", "G", "T", "P", "E", "Z", "Y"),
"customary_ext": (
"byte",
"kilo",
"mega",
"giga",
"tera",
"peta",
"exa",
"zetta",
"iotta",
),
"iec": ("Bi", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"),
"iec_ext": ("byte", "kibi", "mebi", "gibi", "tebi", "pebi", "exbi", "zebi", "yobi"),
}
def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'):
def bytes2human(n, format="%(value).1f %(symbol)s", symbols="customary"):
"""
Convert n bytes into a human readable string based on format.
symbols can be either "customary", "customary_ext", "iec" or "iec_ext",
@@ -59,7 +68,7 @@ def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'):
symbols = SYMBOLS[symbols]
prefix = {}
for i, s in enumerate(symbols[1:]):
prefix[s] = 1 << (i+1)*10
prefix[s] = 1 << (i + 1) * 10
for symbol in reversed(symbols[1:]):
if n >= prefix[symbol]:
value = float(n) / prefix[symbol]

gus/lib/whoosh_extensions.py (23 lines changed)

@@ -16,7 +16,12 @@ def UrlAnalyzer():
"""
return RegexTokenizer(expression=":1965|^gemini://|[/\.\?]", gaps=True) | IntraWordFilter() | LowercaseFilter() | StemFilter()
return (
RegexTokenizer(expression=":1965|^gemini://|[/\.\?]", gaps=True)
| IntraWordFilter()
| LowercaseFilter()
| StemFilter()
)
class GeminiFormatter(highlight.Formatter):
@@ -35,7 +40,6 @@ class GeminiFormatter(highlight.Formatter):
# string
return "%s" % tokentext
def format_fragment(self, fragment, replace=False):
"""Returns a formatted version of the given text, using the "token"
objects in the given :class:`Fragment`.
@@ -57,21 +61,22 @@ class GeminiFormatter(highlight.Formatter):
if t.startchar < index:
continue
if t.startchar > index:
output.append(self._text(text[index:t.startchar]))
output.append(self._text(text[index : t.startchar]))
output.append(self.format_token(text, t, replace))
index = t.endchar
output.append(self._text(text[index:fragment.endchar]))
output.append(self._text(text[index : fragment.endchar]))
output.append("...")
out_string = "".join(output)
out_string = out_string.replace("\n", " ").replace('\r', ' ')
out_string = ' '.join(out_string.split())
out_string = out_string.replace("\n", " ").replace("\r", " ")
out_string = " ".join(out_string.split())
return out_string
special_char_pattern = re.compile("[^\w\s,\.;-\?\!']")
link_pattern = re.compile("://|=>")
class GeminiScorer(highlight.FragmentScorer):
def __call__(self, f):
# Add up the boosts for the matched terms in this passage
@@ -87,10 +92,12 @@ class GeminiScorer(highlight.FragmentScorer):
# ascii art, as well as source code (which, I suppose will make snippets
# lower quality for actual searches for source code, but that is a very
# small minority of searches in the current state of things).
num_special_chars = len(special_char_pattern.findall(f.text[f.startchar:f.endchar]))
num_special_chars = len(
special_char_pattern.findall(f.text[f.startchar : f.endchar])
)
score -= 4 * num_special_chars + math.pow(num_special_chars, 1.5)
num_links = len(link_pattern.findall(f.text[f.startchar:f.endchar]))
num_links = len(link_pattern.findall(f.text[f.startchar : f.endchar]))
score -= 30 * num_links
return max(