
more meta data for index cleanup

René Wagner, 2 months ago
branch: master
commit fa2db540f6

Changed files:
  1. gus/build_index.py (33 lines changed)
  2. gus/constants.py (2 lines changed)
  3. gus/crawl.py (18 lines changed)
  4. gus/excludes.py (2 lines changed)
  5. gus/lib/db_model.py (1 line changed)
  6. gus/lib/index_statistics.py (8 lines changed)

gus/build_index.py (33 lines changed)

@@ -79,36 +79,23 @@ AND l.is_cross_host_like == 1""",
         return False

-def invalidate_recent_results(index, invalidation_window):
-    recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
-    pages = Page.select().where(
-        Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum
-    )
-    logging.debug('Invalidating %d pages %s', pages.count(), recency_minimum)
-    for page in pages:
-        index.delete_by_term("url_id", page.url)

-def build_index(should_run_destructive=False, invalidation_window=0):
+def build_index(should_run_destructive=False):
     index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
     # index_dir = constants.INDEX_DIR_NEW
     db = init_db(index_dir + "/gus.sqlite")
     index = search.Index(index_dir, should_run_destructive)
-    invalidate_recent_results(index, invalidation_window)

     if (should_run_destructive):
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.content_type NOT LIKE 'text/%'
 OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
         )
     else:
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.indexed_at IS NULL OR
 p.indexed_at < p.last_crawl_success_at)
 AND (p.content_type NOT LIKE 'text/%'
@@ -126,14 +113,14 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
     if (should_run_destructive):
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.content_type NOT LIKE 'text/%'
 OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PAGE_SIZE
         )
     else:
         pages = Page.raw(
             """SELECT p.* FROM page AS p
-WHERE p.last_status == 20
+WHERE p.last_success_status == 20
 AND (p.indexed_at IS NULL OR
 p.indexed_at < p.last_crawl_success_at)
 AND (p.content_type NOT LIKE 'text/%'
@@ -159,7 +146,7 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA
 def main():
     args = parse_args()
     gus.lib.logging.handle_arguments(args)
-    build_index(args.should_run_destructive, args.invalidation_window)
+    build_index(args.should_run_destructive)

 def parse_args():
@@ -172,14 +159,6 @@ def parse_args():
         default=False,
         help="create a fresh index",
     )
-    parser.add_argument(
-        "--invalidation_window",
-        "-i",
-        dest="invalidation_window",
-        type=int,
-        default=0,
-        help="a recency window, in hours, for recently crawled pages that should be forcefully reindexed",
-    )
     gus.lib.logging.add_arguments(parser)
     args = parser.parse_args()
     return args
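
Why the WHERE clause changes (a sketch, not part of the diff): last_status reflects whatever the most recent crawl attempt returned, while the new last_success_status is only set from a successful response, so a page whose latest crawl failed can still be reindexed from its last good fetch. Illustrative values only:

    # hypothetical row: latest crawl attempt failed, an earlier one succeeded with Gemini status 20
    page = {"last_status": 43, "last_success_status": 20}
    old_rule = page["last_status"] == 20          # False -> page falls out of the rebuilt index
    new_rule = page["last_success_status"] == 20  # True  -> last successfully fetched content stays indexable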

gus/constants.py (2 lines changed)

@@ -6,7 +6,7 @@ DB_FILENAME = "gus.sqlite"
 FEED_FILE = "feeds.txt"
 MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
 MAXIMUM_FAILED_REQUEST_COUNT = 5
-MAXIMUM_TEXT_PAGE_SIZE = 5120000 # 1000KB, in bytes
+MAXIMUM_TEXT_PAGE_SIZE = 5120000 # in bytes

 # default change frequencies (in hours)
 ROOT_CHANGE_FREQUENCY_DEFAULT = 24
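
For reference, the dropped "1000KB" note did not match the value: 5120000 bytes is 5000 KiB, roughly 4.9 MiB.

    >>> 5_120_000 / 1024   # bytes to KiB
    5000.0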

gus/crawl.py (18 lines changed)

@@ -49,7 +49,8 @@ def index_binary(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status": response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -89,7 +90,8 @@ def index_redirect(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status" : response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -164,7 +166,8 @@ def index_prompt(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status" : response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     existing_page = Page.get_or_none(url=resource.normalized_url)
@@ -208,7 +211,8 @@ def index_content(resource, response):
         "last_crawl_at": datetime.utcnow(),
         "last_crawl_success_at": datetime.utcnow(),
         "last_status" : response.status,
-        "last_stats_message" : response.error_message,
+        "last_success_status" : response.status,
+        "last_status_message" : response.error_message,
         "first_seen_at" : datetime.utcnow()
     }
     if response.content_type == "text/gemini":
@@ -257,10 +261,10 @@ def should_skip(resource):
             if m:
                 should_skip = True
     except:
-        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.normalized_url))
-        should_skip = True
-        return should_skip
+        logging.error("Error checking for exclude of url: %s", gus.lib.logging.strip_control_chars(resource.raw_url))
+        should_skip = True
+    return should_skip

 def index_links(from_resource, contained_resources):
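
Taken together, the four index_* helpers now record both the newest crawl attempt and the newest success. A rough sketch of the intended split (not from the repository; record_crawl() is a hypothetical helper for illustration):

    from datetime import datetime

    def record_crawl(doc, response, succeeded):
        # illustration only: how the two status fields are meant to diverge
        doc["last_crawl_at"] = datetime.utcnow()
        doc["last_status"] = response.status              # always the latest attempt
        doc["last_status_message"] = response.error_message
        if succeeded:
            doc["last_crawl_success_at"] = datetime.utcnow()
            doc["last_success_status"] = response.status  # advances only on success
        return doc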

gus/excludes.py (2 lines changed)

@@ -2,7 +2,6 @@
 # prepended with the gemini:// protocol, be all lowercased, and
 # not have the port specified if it is 1965.
 EXCLUDED_URL_PREFIXES = [
-    # test and other invalid URIs
     "gemini://localhost",
     "gemini://example.org",
     "gemini://example.com",

@@ -136,6 +135,7 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://gemini.susa.net/cgi-bin/search?",
     "gemini://gemini.susa.net/cgi-bin/twitter?",
     "gemini://gemini.susa.net/cgi-bin/vim-search?",
+    "gemini://gemini.susa.net/cgi-bin/links_stu.lua?",
     "gemini://gemini.spam.works/textfiles/",
     "gemini://gemini.spam.works/mirrors/textfiles/",

gus/lib/db_model.py (1 line changed)

@@ -46,6 +46,7 @@ class Page(Model):
     last_crawl_success_at = DateTimeField(null=True)
     last_status = IntegerField(null=True)
     last_status_message = TextField(null=True)
+    last_success_status = IntegerField(null=True)
     first_seen_at = DateTimeField(null=True)

 class Link(Model):
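
Adding last_success_status to the model does not migrate an existing gus.sqlite by itself; an already-populated database would presumably need the column added separately. A minimal peewee sketch, assuming the default database file name from constants.py (not something this commit ships):

    from peewee import IntegerField, SqliteDatabase
    from playhouse.migrate import SqliteMigrator, migrate

    db = SqliteDatabase("gus.sqlite")  # path is an assumption
    migrate(SqliteMigrator(db).add_column("page", "last_success_status", IntegerField(null=True)))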

gus/lib/index_statistics.py (8 lines changed)

@@ -10,11 +10,11 @@ from gus.lib.db_model import Page
 def compute_index_statistics(db):
     page_count = len(Page.raw("""SELECT DISTINCT p.id
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())

     domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""")
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""")
     domains = []
     for d in domains_query.execute():
         s = d.domain

@@ -32,12 +32,12 @@ WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL""")
     content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count'
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL
 GROUP BY p.content_type
 ORDER BY 2 desc""").dicts())

     charset_frequencies = (Page.raw("""SELECT upper(p.charset), count(p.id) as 'count'
 FROM page AS p
-WHERE last_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
+WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
 GROUP BY upper(p.charset)
 ORDER BY 2 desc""").dicts())

     index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()
