# source of geminispace.info - the search provider for gemini space

from peewee import JOIN
from gus import constants
from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage
from gus.lib.gemini import GeminiResource
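
# Each pair maps an alias URL prefix (second element) onto a canonical
# prefix (first element): pages found under the alias are collapsed onto
# the canonical URL so the same gemlog only appears once in a thread.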
collapsible_log_variations = [
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/3albums/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/hitenheroes/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/cornedbeef/"),
    ("gemini://gemini.circumlunar.space/~", "gemini://gemini.circumlunar.space/users/"),
    ("gemini://cetacean.club", "gemini://maj.kahless.cetacean.club"),
]


def find_thread_tops(resource, first_seen, page_id, content, current_chain=[]):
    """
    Recursively walk up to the tops of all threads a given page belongs to,
    then call recurse_thread on each top to actually build the full threads.
    """
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            resource = GeminiResource(collapsible[0] + resource.fetchable_url[len(collapsible[1]):])
            break
    u = resource.indexable_url.rstrip("/")
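    # Candidate parents are the pages this page links out to: a reply links
    # to the post it responds to, so walking outbound links leads toward the
    # top of the thread.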
    parent_pages_query = Page.raw("""SELECT p_to.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
        FROM page AS p_from
        JOIN indexable_crawl AS ic
        ON ic.page_id == p_to.id
        JOIN crawl AS c
        ON c.page_id == p_to.id
        JOIN link AS l
        ON l.from_page_id == p_from.id
        JOIN page AS p_to
        ON p_to.id == l.to_page_id
        WHERE p_from.url IN (?, ?)
        AND p_to.normalized_url != ?
        AND c.status == 20
        AND p_to.content_type LIKE 'text/%'
        GROUP BY p_to.normalized_url
        ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_url)
    found_threadable_parents = False
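    # Recurse upward through every threadable parent; if none is found, this
    # page is itself a thread top.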
    for parent_page in parent_pages_query.iterator():
        parent_resource = GeminiResource(parent_page.fetchable_url)
        for collapsible in collapsible_log_variations:
            if parent_resource.normalized_url.startswith(collapsible[1]):
                parent_resource = GeminiResource(collapsible[0] + parent_page.fetchable_url[len(collapsible[1]):])
                break
        # Skip any parents that are already in the list of seen resources for this call
        # stack - it means they're circular linking
        if any(r for r in current_chain if r.normalized_url == parent_resource.normalized_url):
            continue
        if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
            found_threadable_parents = True
            find_thread_tops(
                parent_resource,
                parent_page.first_seen,
                parent_page.id,
                parent_page.content,
                current_chain + [resource])
    if not found_threadable_parents:
        # return early if thread top already processed
        try:
            query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "001")
            query.get()
            print(f"\nAlready done: {resource.fetchable_url}")
            return
        except ThreadPage.DoesNotExist:
            pass
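        # No threadable parents, so this page is a thread top: build the full
        # thread beneath it, rooted at address "001".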
        full_thread = recurse_thread(resource, "001", first_seen, page_id, content)
        # Deduplicate, keeping the earliest occurrence of each page
        full_thread.reverse()
        i = 0
        while i < len(full_thread):
            if any(x for x in full_thread[i+1:] if x[0].normalized_url == full_thread[i][0].normalized_url):
                full_thread.pop(i)
            else:
                i += 1
        full_thread.reverse()
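        # A thread's updated_at is the newest first_seen among its members.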
        thread_updated_at = max(m[2] for m in full_thread)
        thread = Thread.create(updated_at=thread_updated_at)
        print()
        for m in full_thread:
            ThreadPage.create(
                thread=thread,
                page_id=m[3],
                address=m[1],
                friendly_author=m[0].get_friendly_author(m[4]),
                friendly_title=m[0].get_friendly_title(m[4]),
            )
            print(" -> [{:<19}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))


def recurse_thread(resource, path, first_seen, page_id, content, current_chain=[]):
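    """
    Depth-first walk down a thread: record this page, then recurse into every
    threadable page that links to it, assigning hierarchical addresses like
    "001.001". Returns a list of (resource, address, first_seen, page_id,
    content) tuples.
    """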
    if not resource.is_valid or not resource.is_log_post_like:
        return []
    u = resource.indexable_url.rstrip("/")
    from_urls = [
        u,
        f"{u}/",
    ]
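    # Other pages may have linked to either the canonical or the alias
    # spelling of this URL, so query for both.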
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            new_u = collapsible[0] + resource.indexable_url[len(collapsible[1]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
        elif resource.normalized_url.startswith(collapsible[0]):
            new_u = collapsible[1] + resource.indexable_url[len(collapsible[0]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
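    # Candidate children are the pages that link to this one.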
    children_query = Page.raw("""SELECT p_from.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
        FROM page AS p_from
        JOIN indexable_crawl AS ic
        ON ic.page_id == p_from.id
        JOIN crawl AS c
        ON c.page_id == p_from.id
        JOIN link AS l
        ON l.from_page_id == p_from.id
        JOIN page AS p_to
        ON p_to.id == l.to_page_id
        WHERE p_to.url IN (""" + ", ".join(["?" for x in range(len(from_urls))]) + """)
        AND p_from.normalized_url != ?
        AND c.status == 20
        AND p_from.content_type LIKE 'text/%'
        GROUP BY p_from.normalized_url
        ORDER BY l.is_cross_host_like, first_seen ASC""", *from_urls, resource.normalized_url)
    threadable_child_index = 1
    new_thread_members = [(
        resource,
        path,
        first_seen,
        page_id,
        content,
    )]
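    # Track collapsed URLs so a child reachable under several aliases is only
    # descended into once.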
    processed_collapsed_urls = []
    for child in children_query.iterator():
        collapsed_url = child.fetchable_url
        for collapsible in collapsible_log_variations:
            if child.normalized_url.startswith(collapsible[1]):
                collapsed_url = collapsible[0] + child.fetchable_url[len(collapsible[1]):]
                break
        if collapsed_url in processed_collapsed_urls:
            continue
        processed_collapsed_urls.append(collapsed_url)
        child_resource = GeminiResource(collapsed_url)
        if is_threadable_link(child_resource, resource, child.is_cross_host_like):
            # Skip any children that are already in the list of seen resources for this
            # call stack - it means they're circular linking
            if any(r for r in current_chain if r.normalized_url == child_resource.normalized_url):
                continue
            child_path = f"{path:0>3}.{threadable_child_index:03}"
            new_thread_members.extend(recurse_thread(
                child_resource,
                child_path,
                child.first_seen,
                child.id,
                child.content,
                current_chain + [resource]
            ))
            threadable_child_index += 1
    return new_thread_members


def is_threadable_link(r1, r2, is_cross_host_like):
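    # Both pages must look like log posts, and the link between them must be
    # flagged cross-host-like by the crawler.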
    return r1.is_log_post_like and r2.is_log_post_like and is_cross_host_like


def main():
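    # Wipe and rebuild all threads from scratch on every run.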
    db = init_db(f"index/{constants.DB_FILENAME}")
    Thread.delete().execute()
    ThreadPage.delete().execute()
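    # Select every successfully crawled text page that is not yet part of any
    # thread.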
    pages_query = Page.raw("""SELECT p.*, MIN(c.timestamp) AS first_seen
        FROM page AS p
        JOIN indexable_crawl AS ic
        ON ic.page_id == p.id
        JOIN crawl AS c
        ON c.page_id == p.id
        LEFT JOIN threadpage AS tp
        ON tp.page_id == p.id
        WHERE tp.page_id IS NULL
        AND c.status == 20
        AND p.content_type LIKE 'text/%'
        GROUP BY p.normalized_url
        """)
    for page in pages_query.iterator():
        resource = GeminiResource(page.fetchable_url)
        if resource.is_valid and resource.is_log_post_like:
            find_thread_tops(resource, page.first_seen, page.id, page.content)
    print("\nDone!")


if __name__ == "__main__":
    main()