Browse Source

[crawl] Start indexing the charset

remotes/src/master
Natalie Pendragon 2 years ago
parent
commit
8a1cafaffb
  1. 6
      gus/crawl.py
  2. 34
      gus/lib/index_statistics.py
  3. 22
      gus/serve.py
  4. 2
      poetry.lock

6
gus/crawl.py

@ -145,6 +145,9 @@ def create_index(index_dir):
content_type=TEXT(
stored=True,
),
charset=ID(
stored=True,
),
content=TEXT(
analyzer=FancyAnalyzer(),
spelling=True,
@ -171,6 +174,7 @@ def index_binary(resource, response):
fetchable_url=resource.fetchable_url,
domain=resource.normalized_host,
content_type=response.content_type,
charset=response.charset or "none",
indexed_at=datetime.utcnow(),
)
index_writer.commit()
@ -187,6 +191,7 @@ def index_prompt(resource, response):
fetchable_url=resource.fetchable_url,
domain=resource.normalized_host,
content_type="input",
charset=response.charset or "none",
prompt=response.prompt,
indexed_at=datetime.utcnow(),
)
@ -204,6 +209,7 @@ def index_content(resource, response):
fetchable_url=resource.fetchable_url,
domain=resource.normalized_host,
content_type=response.content_type,
charset=response.charset or "none",
content=response.content,
regex=response.content,
indexed_at=datetime.utcnow(),

34
gus/lib/index_statistics.py

@ -11,16 +11,22 @@ from gus.lib.gemini import GeminiResource
def compute_index_statistics(index_dir):
ix = open_dir(index_dir)
# content types
content_types = None
# content types and charsets
content_types = set()
charsets = set()
with ix.reader() as reader:
all_stored_fields = reader.all_stored_fields()
content_types = set([f["content_type"] for f in all_stored_fields])
for fields in reader.all_stored_fields():
if "charset" in fields:
charset = fields["charset"]
charsets.add(charset)
content_type = fields["content_type"]
content_types.add(content_type)
# page count, domain count, content type frequencies
page_count = 0
domain_count = 0
content_type_frequencies = []
charset_frequencies = []
with ix.searcher() as searcher:
page_count = searcher.doc_count()
@ -32,6 +38,14 @@ def compute_index_statistics(index_dir):
content_type_frequencies.append((content_type, len(results)))
content_type_frequencies.sort(key=lambda pair: pair[1], reverse=True)
# charset frequencies
parser = QueryParser("charset", schema=ix.schema)
for charset in charsets:
query = parser.parse("charset:{}".format(charset))
results = searcher.search(query, limit=9999999)
charset_frequencies.append((charset, len(results)))
charset_frequencies.sort(key=lambda pair: pair[1], reverse=True)
query = Every("url")
results = searcher.search(query, limit=9999999)
domains = set()
@ -47,6 +61,7 @@ def compute_index_statistics(index_dir):
"page_count": page_count,
"domain_count": domain_count,
"content_type_frequencies": content_type_frequencies,
"charset_frequencies": charset_frequencies,
"domains": domains,
}
@ -67,10 +82,14 @@ def print_index_statistics(index_statistics, crawl_statistics):
for pair in index_statistics["content_type_frequencies"]:
print("{:>5} - {}".format(pair[1], pair[0]))
print("\nCharsets:")
for pair in index_statistics["charset_frequencies"]:
print("{:>5} - {}".format(pair[1], pair[0]))
def run_index_statistics():
index_statistics = compute_index_statistics("index")
print_index_statistics(index_statistics)
print_index_statistics(index_statistics, None)
# persist_index_statistics(index_statistics, "index-statistics.csv")
@ -80,7 +99,7 @@ def persist_statistics(index_statistics, crawl_statistics, was_destructive, file
def serialize_statistics_line(index_statistics, crawl_statistics, was_destructive):
return "{:%Y-%m-%d},{},{},{},{},{},{},{},{}\n".format(
return "{:%Y-%m-%d},{},{},{},{},{},{},{},{},{}\n".format(
index_statistics["index_modification_time"],
was_destructive,
index_statistics["page_count"],
@ -90,6 +109,7 @@ def serialize_statistics_line(index_statistics, crawl_statistics, was_destructiv
crawl_statistics["broken_url_count"],
"|".join(index_statistics["domains"]),
"|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
"|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["charset_frequencies"]),
)
@ -112,6 +132,7 @@ def deserialize_statistics_line(line):
broken_url_count = line_parts[6]
domains = [domain for domain in line_parts[7].split("|")]
content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[8].split("|")]
charset_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[9].split("|")]
return {
"index_modification_time": index_modification_time,
@ -122,4 +143,5 @@ def deserialize_statistics_line(line):
"broken_url_count": broken_url_count,
"domains": domains,
"content_type_frequencies": content_type_frequencies,
"charset_frequencies": charset_frequencies,
}

22
gus/serve.py

@ -40,6 +40,7 @@ def load_and_compute_statistics(filename):
statistics["page_count"] = index_statistics["page_count"]
statistics["domain_count"] = index_statistics["domain_count"]
statistics["content_type_frequencies"] = index_statistics["content_type_frequencies"]
statistics["charset_frequencies"] = index_statistics["charset_frequencies"]
statistics["domains"] = index_statistics["domains"]
return statistics
@ -83,6 +84,15 @@ def _render_index_statistics():
]
for pair in last_statistics["content_type_frequencies"]:
d.append("{:>5} - {}".format(pair[1], pair[0]))
d.extend([
"",
"## By Charset",
"",
"These figures are representative of the number of pages seen per charset at the time the current index was last updated on {:%Y-%m-%d}.".format(last_statistics["index_modification_time"]),
"",
])
for pair in last_statistics["charset_frequencies"]:
d.append("{:>5} - {}".format(pair[1], pair[0]))
return d
@ -105,6 +115,10 @@ def _render_news():
"# GUS News",
]
news_items = [
{
"date": "2020-06-03",
"content": "Added ability to search and filter by charset. Documentation for this feature can be found on the advanced searching section of the about page!",
},
{
"date": "2020-05-21",
"content": "Added ability to search and filter by domain. Documentation for this feature can be found on the advanced searching section of the about page!",
@ -188,6 +202,7 @@ def index(request):
"To improve the quality of your search results, you can apply filters to constrain your search results in various dimensions. The currently implemented filters are:",
"* content_type",
"* domain",
"* charset",
"",
"To filter by one of these, simply add it to your query followed by a colon, and the value you wish to filter by. Some examples of doing so follow.",
"",
@ -199,7 +214,10 @@ def index(request):
"=> /search?domain:circumlunar domain:circumlunar",
"=> /search?contextual%20domain:gus contextual domain:gus",
"",
"For further inspiration on how to use these filters, you can visit both GUS' list of known hosts, as well as GUS' list of known content_types on the statistics page.",
"=> /search?computers%20content_type%3Agemini%20AND%20NOT%20charset%3AUS-ASCII computers content_type:gemini AND NOT charset:US-ASCII",
"=> /search?NOT%20charset%3Anone NOT charset:none",
"",
"For further inspiration on how to use these filters, you can visit both GUS' list of known hosts, as well as GUS' list of known content_types and charsets on the statistics page. Note that there is some nuance to the charset values, due to the fact that specifying them is optional, and if one does not specify, there is a default of utf-8 - pages that do not specify a charset have an indexed charset value of \"none\".",
"",
"=> /known-hosts GUS Known Hosts (with list of domains)",
"=> /statistics GUS statistics (with list of content_types)",
@ -254,6 +272,7 @@ def _search_index(query, requested_page):
"url" : result["url"],
"fetchable_url": result["fetchable_url"],
"content_type" : result["content_type"],
"charset" : result["charset"] if "charset" in result else "none",
"prompt" : result["prompt"] if "prompt" in result else "",
"highlights" : gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "",
} for result in results
@ -300,6 +319,7 @@ def _render_results(results, verbose=False):
if verbose:
data.append("* Score : {:.2f}".format(result["score"]))
data.append("* Indexed at : {:%Y-%m-%d %H:%M}".format(result["indexed_at"]))
data.append("* Charset : {}".format(result["charset"]))
if len(result["highlights"]) > 0:
data.extend(result["highlights"].split(GeminiFormatter.between))
return data

2
poetry.lock

@ -100,7 +100,7 @@ python-versions = "*"
version = "0.1.0"
[package.source]
reference = "a252aed301aa182a19571465c725a832530f95c7"
reference = "c8867e2a90165958ae58e444791c0003329c6501"
type = "git"
url = "https://git.sr.ht/~natpen/gusmobile"
[[package]]

Loading…
Cancel
Save