Skip to content

Instantly share code, notes, and snippets.

@jjfiv
Created April 13, 2021 14:21
Show Gist options
  • Save jjfiv/cdd43ec1d162d8d51f31ed008e69386b to your computer and use it in GitHub Desktop.
Save jjfiv/cdd43ec1d162d8d51f31ed008e69386b to your computer and use it in GitHub Desktop.
Update v2->v3 qrel/topic files for TREC News
# v3_ids.gz
# gzipped list of document ids in v3 collection, one id per line
# (the Python conversion script opens this file as data/v3_ids.gz)
# $COLLECTION is quoted so a collection path containing spaces still works.
python3 wapo-print-docids.py "$COLLECTION" | gzip > v3_ids.gz
# data/dup-pairs.gz
# first two whitespace-separated columns of every wapo-near-duplicates line,
# compressed in a single pipeline so no uncompressed intermediate is left behind
# (the Python conversion script opens this file as data/dup-pairs.gz)
awk '{print $1, $2}' wapo-near-duplicates | gzip > dup-pairs.gz
import gzip
import json
from tqdm import tqdm
from collections import Counter
# Gather every document id that appears in the background-linking qrel files.
needs_consideration = set()
for qrel_name in (
    "newsir18-background-linking.qrel",
    "newsir19-background-linking.qrel",
    "newsir20-background-linking.qrel",
):
    qrel_path = "data/queries/{}".format(qrel_name)
    with open(qrel_path) as qrel_fp:
        for row in qrel_fp:
            # Each row must split into exactly four whitespace-separated
            # fields; only the third (the document id) is kept.
            _qid, _skip, judged_docid, _rel = row.strip().split()
            needs_consideration.add(judged_docid)
qrel_id_count = len(needs_consideration)
print("Loaded qrel-ids: {}".format(qrel_id_count))
# Add the document ids named in <docid> elements of the topic files to
# needs_consideration (the set already holding the qrel document ids).
for top_file in [
    "newsir18-background-linking-topics.v2.xml",
    "newsir19-background-linking-topics.xml",
    "newsir20-topics.xml",
]:
    with open("data/queries/{}".format(top_file)) as fp:
        for line in fp:
            if "<docid>" not in line:
                continue
            # Strip the line first: removeprefix/removesuffix only match at the
            # very ends of the string, so the trailing newline (or any leading
            # indentation) would otherwise leave the tags attached to the id.
            docid = (
                line.strip()
                .removeprefix("<docid>")
                .removesuffix("</docid>")
                .strip()
            )
            needs_consideration.add(docid)
# The count printed here is cumulative: qrel ids plus topic ids.
print("Loaded topic-ids: {}".format(len(needs_consideration)))
# N is only used as the progress-bar total while reading the id list
# (presumably the number of lines in data/v3_ids.gz -- confirm).
N = 671_947
# Ids read from data/v3_ids.gz, one stripped line each.
in_ids = set()
with gzip.open("data/v3_ids.gz", "rt") as ids_fp:
    in_ids.update(raw_id.strip() for raw_id in tqdm(ids_fp, total=N))
# Ids looked up via contains_doc() that were not found in in_ids.
outside_ids = set()
# Maps an id missing from in_ids to the id that replaces it.
outside_to_inside = {}
def contains_doc(docid: str) -> bool:
    """Return True when ``docid`` is in ``in_ids``.

    Any id that is not in ``in_ids`` is recorded in ``outside_ids`` as a side
    effect, and False is returned.
    """
    present = docid in in_ids
    if not present:
        # Adding to a set is a no-op when the id was already recorded.
        outside_ids.add(docid)
    return present
# Scan the near-duplicate pairs and, for every pair whose left id is in
# in_ids but whose right id is not, record right -> left as a replacement.
x = Counter()
with gzip.open("data/dup-pairs.gz", "rt") as fp:
    for line in tqdm(fp, total=749889):
        # Every line must hold exactly two whitespace-separated ids.
        lhs, rhs = line.strip().split()
        left = contains_doc(lhs)
        right = contains_doc(rhs)
        # both still in v3:
        if left and right:
            x["both"] += 1
            continue
        # The rest of the loop relies on the left id being present. This is
        # input validation, so raise explicitly instead of using `assert`,
        # which is removed when Python runs with -O.
        if not left:
            raise ValueError(
                "left id of duplicate pair is not in v3_ids: {} {}".format(lhs, rhs)
            )
        # left, but not right:
        outside_to_inside[rhs] = lhs
        # contains_doc() just recorded rhs in outside_ids; it now has a
        # replacement, so take it back out.
        outside_ids.remove(rhs)
# Number of replaced ids, then number of missing ids left without a replacement.
print(len(outside_to_inside))
print(len(outside_ids))
with open("outside_to_inside.json", "w") as out:
    json.dump(outside_to_inside, out)
import json
# Read the docid replacement map stored in outside_to_inside.json.
with open("outside_to_inside.json") as mapping_fp:
    translations = json.load(mapping_fp)
def translate(docid: str) -> str:
    """Return the replacement for ``docid`` from ``translations``.

    Ids without an entry in ``translations`` are returned unchanged.
    """
    if docid in translations:
        return translations[docid]
    return docid
# Rewrite each qrel file as <name>.v3.qrel with every document id passed
# through translate(); fields are written separated by single spaces.
for qrel_file in (
    "newsir18-background-linking.exp",
    "newsir19-background-linking",
    "newsir20-background-linking",
):
    # The output name drops a trailing ".exp" from the input name, if any.
    out_file = qrel_file.removesuffix('.exp')
    v3_path = "data/queries/{}.v3.qrel".format(out_file)
    src_path = "data/queries/{}.qrel".format(qrel_file)
    # The output file is opened before the input file.
    with open(v3_path, "w") as out, open(src_path) as fp:
        for row in fp:
            qid, skip, docid, rel = row.strip().split()
            rewritten = "{} {} {} {}".format(qid, skip, translate(docid), rel)
            print(rewritten, file=out)
# Rewrite each topic file as <name>.v3.xml with translated document ids.
# A .v3.xml file is written only when at least one id in the file changed.
for top_file in (
    "newsir18-background-linking-topics.v2",
    "newsir19-background-linking-topics",
    "newsir20-topics",
):
    # The output name drops a trailing ".v2" from the input name, if any.
    out_name = top_file.removesuffix(".v2")
    keep_lines = []
    found_change = False
    with open("data/queries/{}.xml".format(top_file)) as fp:
        for raw_line in fp:
            if "<docid>" in raw_line:
                # Peel the tags off the stripped line to get the bare id.
                bare = raw_line.strip().removesuffix("</docid>")
                docid = bare.removeprefix("<docid>").strip()
                new_docid = translate(docid)
                found_change = found_change or new_docid != docid
                # docid lines are always re-emitted in this normalised form.
                keep_lines.append("<docid> {} </docid>".format(new_docid))
            else:
                # Other lines are kept, minus trailing whitespace.
                keep_lines.append(raw_line.rstrip())
    if found_change:
        with open("data/queries/{}.v3.xml".format(out_name), "w") as out:
            out.writelines(kept + "\n" for kept in keep_lines)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment