
rewrite statistics gathering to pure sql

the peewee functions lead to an error
because too many variables are generated

fixes #21
master · René Wagner · 5 months ago · commit 9b21f64790
3 changed files:
  gus/crawl.py                 8 changed lines
  gus/lib/index_statistics.py  45 changed lines
  infra/update_index.sh        2 changed lines
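
Background on the commit message: peewee expands a Python list passed to .in_() into one bound SQL parameter per list element, and SQLite caps the number of host parameters a single statement may carry, so filtering pages by a long list of ids fails once the index grows large enough. The commit message does not quote the exact error, but the sketch below reproduces the likely failure mode with a hypothetical Item model (an assumption, not code from this repository); the rewritten statistics avoid it by expressing the filter as a JOIN in raw SQL, which binds no per-element parameters.

# minimal sketch, assuming an in-memory SQLite database and a hypothetical
# Item model; not code from this repository
from peewee import SqliteDatabase, Model, IntegerField, OperationalError

db = SqliteDatabase(":memory:")

class Item(Model):
    value = IntegerField()

    class Meta:
        database = db

db.create_tables([Item])

# .in_() renders as "WHERE id IN (?, ?, ...)" with one bound parameter per
# element; 500000 parameters exceeds SQLite's per-statement limit
ids = list(range(500000))
try:
    list(Item.select().where(Item.id.in_(ids)))
except OperationalError as exc:
    print(exc)  # typically "too many SQL variables"

# deriving the id set inside SQL (a JOIN or subquery) binds no per-element
# parameters, which is the approach this commit takes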

gus/crawl.py (8 changed lines)

@@ -831,13 +831,13 @@ def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
 def recrawl_feeds():
     content_urls = resolve_feed_content_urls()
     global index_dir
-    index_dir = constants.INDEX_DIR_NEW
+    index_dir = constants.INDEX_DIR
     global db
     db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
     global max_crawl_depth
     max_crawl_depth = 0
     global robot_file_map
-    robot_file_map = unpickle_robot_file_map(constants.INDEX_DIR_NEW)
+    robot_file_map = unpickle_robot_file_map(constants.INDEX_DIR)
     global domain_hit_timings
     domain_hit_timings = {}
@@ -856,14 +856,14 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     # TODO: track failed domain/page attempts, and don't reattempt for 15mins
     global index_dir
-    index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR_NEW
+    index_dir = constants.INDEX_DIR_NEW if should_run_destructive else constants.INDEX_DIR
     pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
     global db
     db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
     global robot_file_map
     robot_file_map = (
-        {} if should_run_destructive else unpickle_robot_file_map(constants.INDEX_DIR_NEW)
+        {} if should_run_destructive else unpickle_robot_file_map(constants.INDEX_DIR)
     )
     global domain_hit_timings
     domain_hit_timings = {}

gus/lib/index_statistics.py (45 changed lines)

@@ -8,20 +8,19 @@ from gus.lib.db_model import Page, Crawl
 def compute_index_statistics(db):
-    valid_page_ids_query = Page.raw("""SELECT p.id
+    page_count = len(Page.raw("""SELECT p.id
 FROM indexable_crawl AS c
 JOIN page AS p
 ON p.id == c.page_id
-GROUP BY p.normalized_url""")
-    valid_page_ids = [p.id for p in valid_page_ids_query.execute()]
-    page_count = len(valid_page_ids)
-    domains_query = (Page
-                     .select(Page.domain, Page.port)
-                     .where(Page.id.in_(valid_page_ids))
-                     .distinct())
+GROUP BY p.normalized_url""").dicts())
+    domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
+FROM indexable_crawl AS c
+JOIN page AS p
+ON p.id == c.page_id
+""")
     domains = []
-    for d in domains_query:
+    for d in domains_query.execute():
         s = d.domain
         if d.port != 1965:
             s = f"{d.domain}:{d.port}"
@@ -34,18 +33,20 @@ GROUP BY p.normalized_url""")
             continue
         domains.append(s)
     domain_count = len(domains)
-    content_type_frequencies = (Page
-                                .select(Page.content_type, fn.Count(Page.content_type).alias("count"))
-                                .where(Page.id.in_(valid_page_ids))
-                                .group_by(Page.content_type)
-                                .order_by(SQL('count').desc())
-                                .dicts())
-    charset_frequencies = (Page
-                           .select(Page.charset, fn.Count(Page.charset).alias("count"))
-                           .where(Page.id.in_(valid_page_ids), Page.charset.is_null(False))
-                           .group_by(Page.charset)
-                           .order_by(SQL('count').desc())
-                           .dicts())
+    content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count'
+FROM indexable_crawl AS c
+JOIN page AS p
+ON p.id == c.page_id
+GROUP BY p.content_type
+ORDER BY 2 desc""").dicts())
+    charset_frequencies = (Page.raw("""SELECT p.charset, count(p.charset) as 'count'
+FROM indexable_crawl AS c
+JOIN page AS p
+ON p.id == c.page_id
+WHERE p.charset IS NOT NULL
+GROUP BY p.charset
+ORDER BY 2 desc""").dicts())
     index_modification_time = Crawl.select(fn.MAX(Crawl.timestamp)).scalar()
     return {
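
A note on how the rewritten queries are consumed, since the surrounding loop is unchanged: in peewee, Model.raw() returns a query whose execute() yields row objects with attribute access (so d.domain and d.port still work), while dicts() switches the rows to plain dictionaries, and len() on the query executes it and counts the rows. A minimal sketch with made-up data follows; the toy Page model here is an assumption, not the GUS schema.

# minimal sketch of peewee raw-query result handling, assuming an
# in-memory SQLite database and a toy Page model; not the GUS schema
from peewee import SqliteDatabase, Model, TextField, IntegerField

db = SqliteDatabase(":memory:")

class Page(Model):
    domain = TextField()
    port = IntegerField()

    class Meta:
        database = db

db.create_tables([Page])
Page.create(domain="example.org", port=1965)
Page.create(domain="example.org", port=1966)

# execute() yields model-like rows with attribute access
for d in Page.raw("SELECT DISTINCT domain, port FROM page").execute():
    print(d.domain, d.port)

# dicts() yields plain dictionaries; len() runs the query and counts rows
rows = Page.raw("SELECT id FROM page GROUP BY domain").dicts()
print(len(rows), list(rows))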

infra/update_index.sh (2 changed lines)

@@ -1,5 +1,5 @@
-cp -r /home/gus/index /home/gus/index.new
 /home/gus/.poetry/bin/poetry run crawl
+cp -r /home/gus/index /home/gus/index.new
 /home/gus/.poetry/bin/poetry run build_index
 rm -rf /home/gus/index.old
 rm -rf /home/gus/index.new/MAIN.tmp/
