""" | |
# stanza.download() | |
# http://nlp.stanford.edu/software/stanza/1.0.0/en/default.zip | |
Example usage: | |
CUDA_VISIBLE_DEVICES=1 \ | |
python parsing.py \ | |
--input_corpus_path ./corpora/gkb_best_sent.txt \ | |
--output_json_path ./parses/gkb_best.parses.jsonl \ | |
--prefix gkb_best --num_shards 10000 --shard_id 0 | |
""" | |
import json
import argparse

import numpy as np
from tqdm import tqdm
import stanza
from spacy_stanza import StanzaLanguage
# Note: StanzaLanguage is the spacy-stanza < 1.0 API; newer releases expose
# spacy_stanza.load_pipeline() instead.
snlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,depparse")
nlp = StanzaLanguage(snlp)
def dependency_parsing(sent_id, sent):
    """Parse one sentence into a dict of token records; return None if the
    parse does not have exactly one root (i.e., not a single sentence)."""
    doc = nlp(sent)
    roots = [token for token in doc if token.head == token]
    if len(roots) != 1:
        # keep only inputs that parse as exactly one sentence
        return None
    sent_dict = {'sent_id': sent_id, 'sentence': sent, 'tokens': []}
    for token in doc:
        token_dict = {'idx': token.i,  # token index, to differentiate repeated words in a sentence
                      'text': token.text,
                      'pos': token.pos_,
                      'tag': token.tag_,
                      'dep': token.dep_,
                      'lemma': token.lemma_,
                      'parent': token.head.i,  # the root points to itself
                      'children': []}
        for child in token.children:
            token_dict['children'].append(child.i)
        sent_dict['tokens'].append(token_dict)
    sent_dict['tokens'].sort(key=lambda token: token['idx'])
    return sent_dict
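# A small helper, not part of the original script, sketching how the token
# records can be consumed downstream: it rebuilds (head, deprel, child)
# triples from the 'parent' pointers stored above. It relies on 'idx' being
# the token's position in the sorted 'tokens' list, which holds by
# construction in dependency_parsing.
def edges_from_parse(sent_dict):
    """Yield (head_text, dep_label, child_text) for every non-root token."""
    tokens = sent_dict['tokens']
    for token in tokens:
        if token['parent'] == token['idx']:
            continue  # skip the root, which points to itself
        yield (tokens[token['parent']]['text'], token['dep'], token['text'])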
def file_to_json(input_lines, prefix, shard_id=-1):
    output_jsonlines = []
    for line in tqdm(input_lines, desc="shard_id:%d" % shard_id):
        sent_dict = dependency_parsing("{0}-{1}".format(prefix, len(output_jsonlines)), line)
        # unparseable sentences become JSON null, keeping sent ids aligned
        # with output line numbers
        output_jsonlines.append(json.dumps(sent_dict))
    return output_jsonlines
if __name__ == "__main__":
    # modify the paths here, or add a for-loop to traverse the corpus
    flags_parser = argparse.ArgumentParser()
    flags_parser.add_argument('--input_corpus_path', type=str, help='the input corpus file: one plain sentence per line.')
    flags_parser.add_argument('--output_json_path', type=str, help='the output file of the parses, in JSONL format.')
    flags_parser.add_argument('--prefix', type=str, help='name of the corpus, used as the sent_id prefix.')
    flags_parser.add_argument('--num_shards', type=int, default=0, help='number of shards to split the corpus into')
    flags_parser.add_argument('--shard_id', type=int, default=-1, help='id of the shard to process')
    args = flags_parser.parse_args()

    output_path = args.output_json_path
    with open(args.input_corpus_path) as f:
        all_lines = f.read().split("\n")[:-1]  # drop the trailing empty line
    if args.num_shards > 0 and args.shard_id >= 0:
        # process only the shard_id-th of num_shards roughly equal slices
        lines = list(np.array_split(all_lines, args.num_shards)[args.shard_id])
        output_path += ".%d-%d" % (args.shard_id, args.num_shards)
    else:
        lines = all_lines
    output_jsonlines = file_to_json(lines, args.prefix, args.shard_id)
    with open(output_path, "w") as f:
        f.write('\n'.join(output_jsonlines) + "\n")
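# Reading a shard's output back is then one json.loads per line, e.g.
# (illustrative path; the shard suffix follows the ".%d-%d" pattern above):
#
#   with open("./parses/gkb_best.parses.jsonl.0-10000") as f:
#       for line in f:
#           parse = json.loads(line)
#           if parse is None:  # a sentence that did not parse to a single root
#               continue
#           ...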