jjfiv · April 13, 2021 14:21
diff --git a/step1_collection_info.sh b/step1_collection_info.sh
 # v3_ids.gz
 # Gzipped list of the document ids in the v3 collection, i.e. whatever
 # wapo-print-docids.py prints for $COLLECTION. The variable is quoted so a
 # collection path containing spaces is passed as one argument.
 python3 wapo-print-docids.py "$COLLECTION" | gzip > v3_ids.gz

 # data/dup-pairs.gz
 # Keep only the first two whitespace-separated fields of each line of
 # wapo-near-duplicates, then compress (gzip replaces dup-pairs with
 # dup-pairs.gz).
 # NOTE(review): both outputs are written to the current directory, while
 # step2_identify_needs_changing.py reads them from data/ -- confirm where
 # this script is meant to be run.
 awk '{print $1, $2}' wapo-near-duplicates > dup-pairs
 gzip dup-pairs
diff --git a/step2_identify_needs_changing.py b/step2_identify_needs_changing.py
 import gzip
 import json
 from tqdm import tqdm
 from collections import Counter

 # Collect every docid referenced by the qrel files and the topic files.
 # In the visible script the set is only used for the two counts printed here.
 needs_consideration = set()
 for qrel_file in [
    "newsir18-background-linking.qrel",
    "newsir19-background-linking.qrel",
    "newsir20-background-linking.qrel",
 ]:
    with open("data/queries/{}".format(qrel_file)) as fp:
        for line in fp:
            # Each qrel line must have exactly four whitespace-separated
            # fields; the third one is the document id.
            [qid, skip, docid, rel] = line.strip().split()
            needs_consideration.add(docid)

 print("Loaded qrel-ids: {}".format(len(needs_consideration)))

 for top_file in [
    "newsir18-background-linking-topics.v2.xml",
    "newsir19-background-linking-topics.xml",
    "newsir20-topics.xml",
 ]:
    with open("data/queries/{}".format(top_file)) as fp:
        for line in fp:
            if "<docid>" not in line:
                continue
            # Strip before removing the tags: the raw line still ends with a
            # newline (and may start with indentation), which would keep
            # removeprefix/removesuffix from matching and leave the tags
            # inside the collected id.
            docid = (
                line.strip()
                .removeprefix("<docid>")
                .removesuffix("</docid>")
                .strip()
            )
            needs_consideration.add(docid)

 print("Loaded topic-ids: {}".format(len(needs_consideration)))


 # Expected number of lines in data/v3_ids.gz; only sizes the progress bar.
 N = 671_947
 with gzip.open("data/v3_ids.gz", "rt") as fp:
    # One document id per line of the gzipped listing.
    in_ids = {raw.strip() for raw in tqdm(fp, total=N)}

 # Ids met in the dup-pairs file that are not in in_ids and have not (yet)
 # been mapped to a replacement.
 outside_ids = set()
 # Maps an id missing from in_ids to the duplicate id that is present.
 outside_to_inside = dict()


 def contains_doc(docid: str) -> bool:
    """Report whether ``docid`` is one of the ids loaded into ``in_ids``.

    Side effect: an id that is not in ``in_ids`` is recorded in
    ``outside_ids`` (adding an id that is already there is a no-op).
    """
    present = docid in in_ids
    if not present:
        outside_ids.add(docid)
    return present


 # Walk the near-duplicate pairs. For every pair whose right-hand id is not in
 # the collection, record the left-hand id as its replacement.
 # x only tallies the pairs where both ids are present; it is never printed.
 x = Counter()
 with gzip.open("data/dup-pairs.gz", "rt") as fp:
    for line in tqdm(fp, total=749889):
        [lhs, rhs] = line.strip().split()
        left = contains_doc(lhs)
        right = contains_doc(rhs)
        # both still in v3:
        if left and right:
            x["both"] += 1
            continue
        # The left-hand id is required to be in the collection. Raise
        # explicitly instead of asserting so the check is not stripped by
        # `python -O` (which would silently skip the pair).
        if not left:
            raise ValueError(
                "dup-pair ({}, {}): left-hand id is not in the v3 id list".format(
                    lhs, rhs
                )
            )
        # left, but not right:
        outside_to_inside[rhs] = lhs
        # contains_doc(rhs) has just added rhs to outside_ids, so this
        # removal cannot fail; rhs now has a replacement.
        outside_ids.remove(rhs)
 print(len(outside_to_inside))
 print(len(outside_ids))

 with open("outside_to_inside.json", "w") as out:
    json.dump(outside_to_inside, out)
diff --git a/step3_update_files.py b/step3_update_files.py
 import json

 # Load the JSON object stored in outside_to_inside.json; translate() below
 # looks document ids up in it.
 with open("outside_to_inside.json") as fp:
    translations = json.load(fp)


 def translate(docid: str) -> str:
    """Return the entry recorded for ``docid`` in ``translations``.

    Ids without an entry are returned unchanged.
    """
    if docid in translations:
        return translations[docid]
    return docid


 # Write a ".v3.qrel" copy of every qrel file with each document id passed
 # through translate(). A trailing ".exp" in the input name is dropped from
 # the output name.
 for qrel_file in [
    "newsir18-background-linking.exp",
    "newsir19-background-linking",
    "newsir20-background-linking",
 ]:
    out_file = qrel_file.removesuffix('.exp')
    target_path = "data/queries/{}.v3.qrel".format(out_file)
    source_path = "data/queries/{}.qrel".format(qrel_file)
    # The output file is opened (and truncated) before the input is opened.
    with open(target_path, "w") as out, open(source_path) as fp:
        for line in fp:
            # Exactly four whitespace-separated fields are expected per line.
            qid, skip, docid, rel = line.split()
            fields = (qid, skip, translate(docid), rel)
            out.write("{} {} {} {}".format(*fields) + "\n")

 # Produce a ".v3.xml" copy of each topic file, but only when at least one
 # <docid> entry is changed by translate(). A trailing ".v2" in the input
 # name is dropped from the output name.
 for top_file in [
    "newsir18-background-linking-topics.v2",
    "newsir19-background-linking-topics",
    "newsir20-topics",
 ]:
    out_name = top_file.removesuffix(".v2")
    keep_lines = []
    found_change = False
    with open("data/queries/{}.xml".format(top_file)) as fp:
        for line in fp:
            if "<docid>" in line:
                # Peel the tags off the stripped line to get the bare id.
                without_close = line.strip().removesuffix("</docid>")
                docid = without_close.removeprefix("<docid>").strip()
                new_docid = translate(docid)
                found_change = found_change or new_docid != docid
                # The docid line is always re-emitted in this fixed form.
                keep_lines.append("<docid> {} </docid>".format(new_docid))
            else:
                # Other lines are kept, minus trailing whitespace.
                keep_lines.append(line.rstrip())
        if found_change:
            with open("data/queries/{}.v3.xml".format(out_name), "w") as out:
                out.write("".join(kept + "\n" for kept in keep_lines))
	# v3_ids.gz
	# Gzipped list of the document ids in the v3 collection, i.e. whatever
	# wapo-print-docids.py prints for $COLLECTION. The pipe must be a real,
	# unescaped "|" (an escaped one is passed to the script as an argument),
	# and the variable is quoted so a path with spaces stays one argument.
	python3 wapo-print-docids.py "$COLLECTION" | gzip > v3_ids.gz

	# data/dup-pairs.gz
	# Keep only the first two whitespace-separated fields of each line of
	# wapo-near-duplicates, then compress (gzip replaces dup-pairs with
	# dup-pairs.gz).
	awk '{print $1, $2}' wapo-near-duplicates > dup-pairs
	gzip dup-pairs
	import gzip
	import json
	from tqdm import tqdm
	from collections import Counter

	# Collect every docid referenced by the qrel files and the topic files.
	# Block structure is restored here; without indentation the loops do not parse.
	needs_consideration = set()
	for qrel_file in [
	    "newsir18-background-linking.qrel",
	    "newsir19-background-linking.qrel",
	    "newsir20-background-linking.qrel",
	]:
	    with open("data/queries/{}".format(qrel_file)) as fp:
	        for line in fp:
	            # Each qrel line must have exactly four whitespace-separated
	            # fields; the third one is the document id.
	            [qid, skip, docid, rel] = line.strip().split()
	            needs_consideration.add(docid)

	print("Loaded qrel-ids: {}".format(len(needs_consideration)))

	for top_file in [
	    "newsir18-background-linking-topics.v2.xml",
	    "newsir19-background-linking-topics.xml",
	    "newsir20-topics.xml",
	]:
	    with open("data/queries/{}".format(top_file)) as fp:
	        for line in fp:
	            if "<docid>" not in line:
	                continue
	            # Strip before removing the tags: the raw line still ends with
	            # a newline, which would keep removesuffix from matching and
	            # leave the tags inside the collected id.
	            docid = (
	                line.strip()
	                .removeprefix("<docid>")
	                .removesuffix("</docid>")
	                .strip()
	            )
	            needs_consideration.add(docid)

	print("Loaded topic-ids: {}".format(len(needs_consideration)))


	# Expected number of lines in data/v3_ids.gz; only sizes the progress bar.
	N = 671_947
	in_ids = set()
	with gzip.open("data/v3_ids.gz", "rt") as fp:
	    # One document id per line of the gzipped listing.
	    for line in tqdm(fp, total=N):
	        in_ids.add(line.strip())

	# Ids met in the dup-pairs file that are not in in_ids and have not (yet)
	# been mapped to a replacement.
	outside_ids = set()
	# Maps an id missing from in_ids to the duplicate id that is present.
	outside_to_inside = {}


	def contains_doc(docid: str) -> bool:
	    """Report whether ``docid`` is one of the ids loaded into ``in_ids``.

	    Side effect: an id that is not in ``in_ids`` is recorded in
	    ``outside_ids`` (adding an id that is already there is a no-op).
	    """
	    if docid in in_ids:
	        return True
	    outside_ids.add(docid)
	    return False


	# Walk the near-duplicate pairs. For every pair whose right-hand id is not
	# in the collection, record the left-hand id as its replacement.
	# x only tallies the pairs where both ids are present; it is never printed.
	x = Counter()
	with gzip.open("data/dup-pairs.gz", "rt") as fp:
	    for line in tqdm(fp, total=749889):
	        [lhs, rhs] = line.strip().split()
	        left = contains_doc(lhs)
	        right = contains_doc(rhs)
	        # both still in v3:
	        if left and right:
	            x["both"] += 1
	            continue
	        # The left-hand id is required to be in the collection. Raise
	        # explicitly instead of asserting so the check is not stripped
	        # by `python -O`.
	        if not left:
	            raise ValueError(
	                "dup-pair ({}, {}): left-hand id is not in the v3 id list".format(
	                    lhs, rhs
	                )
	            )
	        # left, but not right:
	        outside_to_inside[rhs] = lhs
	        # contains_doc(rhs) has just added rhs to outside_ids, so this
	        # removal cannot fail; rhs now has a replacement.
	        outside_ids.remove(rhs)
	print(len(outside_to_inside))
	print(len(outside_ids))

	with open("outside_to_inside.json", "w") as out:
	    json.dump(outside_to_inside, out)
	import json

	# Load the JSON object stored in outside_to_inside.json; translate() below
	# looks document ids up in it.
	with open("outside_to_inside.json") as fp:
	    translations = json.load(fp)


	def translate(docid: str) -> str:
	    """Return the entry recorded for ``docid`` in ``translations``.

	    Ids without an entry are returned unchanged.
	    """
	    return translations.get(docid, docid)


	# Write a ".v3.qrel" copy of every qrel file with each document id passed
	# through translate(). A trailing ".exp" in the input name is dropped from
	# the output name.
	for qrel_file in [
	    "newsir18-background-linking.exp",
	    "newsir19-background-linking",
	    "newsir20-background-linking",
	]:
	    out_file = qrel_file.removesuffix('.exp')
	    with open("data/queries/{}.v3.qrel".format(out_file), "w") as out:
	        with open("data/queries/{}.qrel".format(qrel_file)) as fp:
	            for line in fp:
	                # Exactly four whitespace-separated fields are expected.
	                [qid, skip, docid, rel] = line.strip().split()
	                print(
	                    "{} {} {} {}".format(qid, skip, translate(docid), rel),
	                    file=out,
	                )

	# Produce a ".v3.xml" copy of each topic file, but only when at least one
	# <docid> entry is changed by translate(). A trailing ".v2" in the input
	# name is dropped from the output name.
	for top_file in [
	    "newsir18-background-linking-topics.v2",
	    "newsir19-background-linking-topics",
	    "newsir20-topics",
	]:
	    out_name = top_file.removesuffix(".v2")
	    keep_lines = []
	    found_change = False
	    with open("data/queries/{}.xml".format(top_file)) as fp:
	        for line in fp:
	            if "<docid>" not in line:
	                # Non-docid lines are kept, minus trailing whitespace.
	                keep_lines.append(line.rstrip())
	            else:
	                # Peel the tags off the stripped line to get the bare id.
	                docid = (
	                    line.strip()
	                    .removesuffix("</docid>")
	                    .removeprefix("<docid>")
	                    .strip()
	                )
	                new_docid = translate(docid)
	                if new_docid != docid:
	                    found_change = True
	                # The docid line is always re-emitted in this fixed form.
	                keep_lines.append("<docid> {} </docid>".format(new_docid))
	        if found_change:
	            with open("data/queries/{}.v3.xml".format(out_name), "w") as out:
	                for line in keep_lines:
	                    print(line, file=out)