Created
April 13, 2021 14:21
-
-
Save jjfiv/cdd43ec1d162d8d51f31ed008e69386b to your computer and use it in GitHub Desktop.
Update v2->v3 qrel/topic files for TREC News
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# v3_ids.gz
# gzipped list of document ids in v3 collection
# COLLECTION is quoted so that a collection path containing spaces or glob
# characters reaches the script as a single argument.
python3 wapo-print-docids.py "$COLLECTION" | gzip > v3_ids.gz
# data/dup-pairs.gz
# Keep only the first two whitespace-separated columns of each line of the
# near-duplicate list, then compress the result to dup-pairs.gz.
# awk reads the file directly; no extra `cat` process is needed.
awk '{print $1, $2}' wapo-near-duplicates > dup-pairs
gzip dup-pairs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip | |
import json | |
from tqdm import tqdm | |
from collections import Counter | |
# Document ids referenced by the judgment (qrel) files; the topic files add
# their ids to the same set further down.
needs_consideration = set()
qrel_names = (
    "newsir18-background-linking.qrel",
    "newsir19-background-linking.qrel",
    "newsir20-background-linking.qrel",
)
for qrel_name in qrel_names:
    with open("data/queries/{}".format(qrel_name)) as qrel_fp:
        for row in qrel_fp:
            # Every judgment line must hold exactly four whitespace-separated
            # fields; only the third (the document id) is kept.
            _qid, _skip, judged_docid, _rel = row.split()
            needs_consideration.add(judged_docid)
print("Loaded qrel-ids: {}".format(len(needs_consideration)))
# Add the document id of every <docid> line found in the topic files to
# needs_consideration, then report the running total.
for top_file in [
    "newsir18-background-linking-topics.v2.xml",
    "newsir19-background-linking-topics.xml",
    "newsir20-topics.xml",
]:
    with open("data/queries/{}".format(top_file)) as fp:
        for line in fp:
            if "<docid>" not in line:
                continue
            # Strip surrounding whitespace FIRST: lines read from a file end
            # with a newline (and may be indented), so removing the tags
            # before stripping would leave "</docid>" (and possibly
            # "<docid>") attached to the id.
            docid = (
                line.strip()
                .removeprefix("<docid>")
                .removesuffix("</docid>")
                .strip()
            )
            needs_consideration.add(docid)
print("Loaded topic-ids: {}".format(len(needs_consideration)))
# Passed to tqdm as the progress-bar total for data/v3_ids.gz.
N = 671_947
# One id per line of the gzipped id list, with surrounding whitespace removed.
with gzip.open("data/v3_ids.gz", "rt") as ids_fp:
    in_ids = {id_line.strip() for id_line in tqdm(ids_fp, total=N)}
# Ids looked up via contains_doc() that are not in in_ids.
outside_ids = set()
# Maps an id missing from in_ids to a duplicate id that is in in_ids.
outside_to_inside = {}
def contains_doc(docid: str) -> bool:
    """Return True when *docid* is a member of ``in_ids``.

    As a side effect, any id that is not a member is recorded in
    ``outside_ids``.
    """
    present = docid in in_ids
    if not present:
        # set.add is a no-op for ids that were already recorded.
        outside_ids.add(docid)
    return present
# Walk the duplicate pairs and, for each pair whose right-hand id is missing
# from in_ids, remember the left-hand id as its replacement.
x = Counter()
with gzip.open("data/dup-pairs.gz", "rt") as pairs_fp:
    for pair_line in tqdm(pairs_fp, total=749889):
        # Each line must hold exactly two whitespace-separated ids.
        lhs, rhs = pair_line.split()
        # Both lookups always run: contains_doc records missing ids in
        # outside_ids as a side effect.
        left = contains_doc(lhs)
        right = contains_doc(rhs)
        if not (left and right):
            # The left-hand id is expected to always be present.
            assert left
            # Guard kept so behaviour is unchanged when asserts are disabled.
            if left:
                outside_to_inside[rhs] = lhs
                # rhs now has a replacement, so it is no longer unresolved.
                outside_ids.remove(rhs)
        else:
            # both still in v3:
            x["both"] += 1
print(len(outside_to_inside))
print(len(outside_ids))
with open("outside_to_inside.json", "w") as out:
    json.dump(outside_to_inside, out)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
# Mapping consulted by translate(): document id -> replacement document id,
# read from the JSON file in the current working directory.
with open("outside_to_inside.json") as mapping_fp:
    translations = json.load(mapping_fp)
def translate(docid: str) -> str:
    """Return the entry for *docid* in ``translations``, or *docid* itself
    when it has no entry."""
    try:
        return translations[docid]
    except KeyError:
        return docid
# Rewrite each qrel file as <name>.v3.qrel, passing the document id of every
# judgment line through translate().
for qrel_file in [
    "newsir18-background-linking.exp",
    "newsir19-background-linking",
    "newsir20-background-linking",
]:
    # The output name drops a trailing ".exp" from the input name.
    out_file = qrel_file.removesuffix('.exp')
    dst_path = "data/queries/{}.v3.qrel".format(out_file)
    src_path = "data/queries/{}.qrel".format(qrel_file)
    # The output file is opened (and truncated) before the input is opened.
    with open(dst_path, "w") as out, open(src_path) as fp:
        for row in fp:
            # Exactly four whitespace-separated fields per line.
            qid, skip, docid, rel = row.split()
            new_row = "{} {} {} {}".format(qid, skip, translate(docid), rel)
            print(new_row, file=out)
# Rewrite each topic file as <name>.v3.xml with translated document ids.
# A .v3.xml file is written only when at least one id actually changed.
for top_file in [
    "newsir18-background-linking-topics.v2",
    "newsir19-background-linking-topics",
    "newsir20-topics",
]:
    # The output name drops a trailing ".v2" from the input name.
    out_name = top_file.removesuffix(".v2")
    keep_lines = []
    found_change = False
    with open("data/queries/{}.xml".format(top_file)) as fp:
        for raw in fp:
            if "<docid>" in raw:
                # Strip whitespace, peel off the tags, then strip again in
                # case the id was padded inside the element.
                stripped = raw.strip()
                inner = stripped.removesuffix("</docid>").removeprefix("<docid>")
                docid = inner.strip()
                new_docid = translate(docid)
                found_change = found_change or new_docid != docid
                # Every docid line is re-emitted in this normalized form,
                # whether or not its id changed.
                keep_lines.append("<docid> {} </docid>".format(new_docid))
            else:
                # Other lines are copied with trailing whitespace removed.
                keep_lines.append(raw.rstrip())
    if not found_change:
        continue
    with open("data/queries/{}.v3.xml".format(out_name), "w") as out:
        for kept in keep_lines:
            print(kept, file=out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment