"""
# stanza.download()
# http://nlp.stanford.edu/software/stanza/1.0.0/en/default.zip
Example usage:
CUDA_VISIBLE_DEVICES=1 \
python parsing.py \
--input_corpus_path ./corpora/gkb_best_sent.txt \
--output_json_path ./parses/gkb_best.parses.jsonl \
--prefix gkb_best --num_shards 10000 --shard_id 0
"""
import json
import argparse
import numpy as np
from tqdm import tqdm
import stanza
from spacy_stanza import StanzaLanguage

# Build a Stanza pipeline and wrap it so it can be driven through spaCy's API.
snlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,depparse")
nlp = StanzaLanguage(snlp)
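
# Note: StanzaLanguage was removed in newer spacy-stanza releases. On
# spacy-stanza >= 1.0 (spaCy 3), the equivalent setup would be (a sketch,
# untested against this script):
#   import spacy_stanza
#   nlp = spacy_stanza.load_pipeline(
#       "en", processors="tokenize,pos,lemma,depparse")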

def dependency_parsing(sent_id, sent):
    doc = nlp(sent)
    # In spaCy, the root token is its own head; a well-formed single sentence
    # has exactly one root.
    roots = [token for token in doc if token.head == token]
    if len(roots) != 1:
        # Skip inputs that parse into zero or multiple sentences.
        return None
    sent_dict = {'sent_id': sent_id, 'sentence': sent, 'tokens': []}
    for token in doc:
        token_dict = {'idx': token.i,  # position, to differentiate repeated words
                      'text': token.text,
                      'pos': token.pos_,
                      'tag': token.tag_,
                      'dep': token.dep_,
                      'lemma': token.lemma_,
                      'parent': token.head.i,
                      'children': [child.i for child in token.children]}
        sent_dict["tokens"].append(token_dict)
    # Tokens are generated in order, but sort by index to be safe.
    sent_dict['tokens'].sort(key=lambda token: token['idx'])
    return sent_dict
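
# For illustration (an assumed output; exact tags depend on the Stanza model
# version), parsing "Dogs bark ." would yield a record shaped like:
#   {"sent_id": "gkb_best-0",
#    "sentence": "Dogs bark .",
#    "tokens": [
#      {"idx": 0, "text": "Dogs", "pos": "NOUN", "tag": "NNS", "dep": "nsubj",
#       "lemma": "dog", "parent": 1, "children": []},
#      {"idx": 1, "text": "bark", "pos": "VERB", "tag": "VBP", "dep": "root",
#       "lemma": "bark", "parent": 1, "children": [0, 2]},
#      {"idx": 2, "text": ".", "pos": "PUNCT", "tag": ".", "dep": "punct",
#       "lemma": ".", "parent": 1, "children": []}]}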

def file_to_json(input_lines, prefix, shard_id=-1):
    output_jsonlines = []
    for line in tqdm(input_lines, desc="shard_id:%d" % shard_id):
        sent_dict = dependency_parsing(
            "{0}-{1}".format(prefix, len(output_jsonlines)), line)
        # Unparseable sentences serialize as JSON null, keeping the output
        # line-aligned with the input corpus.
        output_jsonlines.append(json.dumps(sent_dict))
    return output_jsonlines
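
# A minimal sketch of reading the parses back (assumes the JSONL layout
# produced above; load_parses and path are placeholder names):
#   def load_parses(path):
#       with open(path) as f:
#           for line in f:
#               record = json.loads(line)
#               if record is not None:  # skip sentences that failed to parse
#                   yield record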

if __name__ == "__main__":
    # Paths and sharding are set via flags; add a for-loop here to traverse a
    # whole corpus directory instead.
    flags_parser = argparse.ArgumentParser()
    flags_parser.add_argument('--input_corpus_path', type=str,
                              help='input corpus file: one plain sentence per line.')
    flags_parser.add_argument('--output_json_path', type=str,
                              help='output file for the parses, in JSONL format.')
    flags_parser.add_argument('--prefix', type=str,
                              help='name of the corpus, used to build sentence ids.')
    flags_parser.add_argument('--num_shards', type=int, default=0,
                              help='number of shards to split the corpus into.')
    flags_parser.add_argument('--shard_id', type=int, default=-1,
                              help='index of the shard to process in this run.')
    args = flags_parser.parse_args()

    output_path = args.output_json_path
    with open(args.input_corpus_path) as f:
        all_lines = f.read().split("\n")[:-1]
    if args.num_shards > 0 and args.shard_id >= 0:
        # Process only the requested shard; np.array_split balances shard sizes.
        lines = list(np.array_split(all_lines, args.num_shards)[args.shard_id])
        output_path += ".%d-%d" % (args.shard_id, args.num_shards)
    else:
        lines = all_lines
    output_jsonlines = file_to_json(lines, args.prefix, args.shard_id)
    with open(output_path, "w") as f:
        f.write('\n'.join(output_jsonlines) + "\n")
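
# A hedged sketch of merging per-shard outputs back into one file (the glob
# pattern follows the shard-suffix naming above; merged.jsonl is a
# placeholder name):
#   import glob
#   shard_files = sorted(glob.glob("./parses/gkb_best.parses.jsonl.*-10000"),
#                        key=lambda p: int(p.rsplit(".", 1)[-1].split("-")[0]))
#   with open("merged.jsonl", "w") as out:
#       for path in shard_files:
#           with open(path) as f:
#               out.write(f.read())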