Browse Source

improve indexing speed via optimized backlinks query

The query that calculates backlinks caused massive delays during indexing.
An unused join to the `indexable_crawl` table caused this behavior.

After removing the join, indexing is fast again.
master
René Wagner 5 months ago
parent
commit
d4093761e1
  1. 8
      gus/build_index.py
  2. 5
      gus/crawl.py
  3. 8
      serve/models.py

8
gus/build_index.py

@ -44,12 +44,8 @@ def index_page(index, page, indexed_urls):
external_backlinks = Page.raw(
"""SELECT p_from.url
FROM page AS p_from
JOIN indexable_crawl AS ic
ON ic.page_id == p_from.id
JOIN link as l
ON l.from_page_id == p_from.id
JOIN page as p_to
ON p_to.id == l.to_page_id
JOIN link as l ON l.from_page_id == p_from.id
JOIN page as p_to ON p_to.id == l.to_page_id
WHERE p_to.url IN (?, ?)
AND l.is_cross_host_like == 1
GROUP BY p_from.normalized_url""",

5
gus/crawl.py

@ -109,11 +109,6 @@ def index_redirect(resource):
def index_error(resource, is_temporary):
logging.debug(
"Indexing error for: %s",
gus.lib.logging.strip_control_chars(resource.indexable_url),
)
category = "temp_error" if is_temporary else "perm_error"
default_change_frequency = resource.get_default_change_frequency(category)
doc = {

8
serve/models.py

@ -56,12 +56,8 @@ class GUS:
backlinks_query = Page.raw(
"""SELECT p_from.url, l.is_cross_host_like
FROM page AS p_from
JOIN indexable_crawl AS ic
ON ic.page_id == p_from.id
JOIN link as l
ON l.from_page_id == p_from.id
JOIN page as p_to
ON p_to.id == l.to_page_id
JOIN link as l ON l.from_page_id == p_from.id
JOIN page as p_to ON p_to.id == l.to_page_id
WHERE p_to.url IN (?, ?)
AND p_from.normalized_url != ?
GROUP BY p_from.normalized_url

Loading…
Cancel
Save