""" | |
# stanza.download() | |
# http://nlp.stanford.edu/software/stanza/1.0.0/en/default.zip | |
Example usage: | |
CUDA_VISIBLE_DEVICES=1 \ | |
python parsing.py \ | |
--input_corpus_path ./corpora/gkb_best_sent.txt \ | |
--output_json_path ./parses/gkb_best.parses.jsonl \ | |
--prefix gkb_best --num_shards 10000 --shard_id 0 | |
""" | |
import json
import argparse

import numpy as np
from tqdm import tqdm
import stanza
from spacy_stanza import StanzaLanguage
# Note: StanzaLanguage is the spacy-stanza < 1.0 API; newer releases expose
# spacy_stanza.load_pipeline() instead.
snlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,depparse")
nlp = StanzaLanguage(snlp)
def dependency_parsing(sent_id, sent):
    """Parse one sentence into a dict of token records; return None if the
    parse does not have exactly one root (i.e., not a single sentence)."""
    doc = nlp(sent)
    roots = [token for token in doc if token.head == token]
    if len(roots) != 1:
        # keep only inputs that parse as exactly one sentence
        return None
    sent_dict = {'sent_id': sent_id, 'sentence': sent, 'tokens': []}
    for token in doc:
        token_dict = {'idx': token.i,  # token index, to differentiate repeated words in a sentence
                      'text': token.text,
                      'pos': token.pos_,
                      'tag': token.tag_,
                      'dep': token.dep_,
                      'lemma': token.lemma_,
                      'parent': token.head.i,  # the root points to itself
                      'children': []}
        for child in token.children:
            token_dict['children'].append(child.i)
        sent_dict['tokens'].append(token_dict)
    sent_dict['tokens'].sort(key=lambda token: token['idx'])
    return sent_dict
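# A small helper, not part of the original script, sketching how the token
# records can be consumed downstream: it rebuilds (head, deprel, child)
# triples from the 'parent' pointers stored above. It relies on 'idx' being
# the token's position in the sorted 'tokens' list, which holds by
# construction in dependency_parsing.
def edges_from_parse(sent_dict):
    """Yield (head_text, dep_label, child_text) for every non-root token."""
    tokens = sent_dict['tokens']
    for token in tokens:
        if token['parent'] == token['idx']:
            continue  # skip the root, which points to itself
        yield (tokens[token['parent']]['text'], token['dep'], token['text'])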
def file_to_json(input_lines, prefix, shard_id=-1):
    output_jsonlines = []
    for line in tqdm(input_lines, desc="shard_id:%d" % shard_id):
        sent_dict = dependency_parsing("{0}-{1}".format(prefix, len(output_jsonlines)), line)
        # unparseable sentences become JSON null, keeping sent ids aligned
        # with output line numbers
        output_jsonlines.append(json.dumps(sent_dict))
    return output_jsonlines
if __name__ == "__main__":
    # modify the paths here, or add a for-loop to traverse the corpus
    flags_parser = argparse.ArgumentParser()
    flags_parser.add_argument('--input_corpus_path', type=str, help='the input corpus file: one plain sentence per line.')
    flags_parser.add_argument('--output_json_path', type=str, help='the output file of the parses, in JSONL format.')
    flags_parser.add_argument('--prefix', type=str, help='name of the corpus, used as the sent_id prefix.')
    flags_parser.add_argument('--num_shards', type=int, default=0, help='number of shards to split the corpus into')
    flags_parser.add_argument('--shard_id', type=int, default=-1, help='id of the shard to process')
    args = flags_parser.parse_args()

    output_path = args.output_json_path
    with open(args.input_corpus_path) as f:
        all_lines = f.read().split("\n")[:-1]  # drop the trailing empty line
    if args.num_shards > 0 and args.shard_id >= 0:
        # process only the shard_id-th of num_shards roughly equal slices
        lines = list(np.array_split(all_lines, args.num_shards)[args.shard_id])
        output_path += ".%d-%d" % (args.shard_id, args.num_shards)
    else:
        lines = all_lines
    output_jsonlines = file_to_json(lines, args.prefix, args.shard_id)
    with open(output_path, "w") as f:
        f.write('\n'.join(output_jsonlines) + "\n")
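# Reading a shard's output back is then one json.loads per line, e.g.
# (illustrative path; the shard suffix follows the ".%d-%d" pattern above):
#
#   with open("./parses/gkb_best.parses.jsonl.0-10000") as f:
#       for line in f:
#           parse = json.loads(line)
#           if parse is None:  # a sentence that did not parse to a single root
#               continue
#           ...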