Last active
November 9, 2018 03:35
-
-
Save tiagocordeiro/4c14a1e85390e70154a94d1182f47b03 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Mock a pre-annotated document and write it to ``teste.json`` as JSON.

The document text is split into sentences on ``!`` and each sentence into
whitespace-separated tokens.  Every sentence and token records ``begin`` /
``end`` character offsets.  The structure is built as plain Python data and
serialised with :mod:`json`, so the output is always well-formed and every
string is escaped correctly.
"""
import json

text = "Get stylish with DOUBLE BENEFIT at TRENDS! Shop for Rs. 4000 & get benefits worth Rs.4000 on the New season collection from 9th Feb-11th Feb'18 & earn RelianceOne pts. RUSH to ur nearest TRENDS store now! Offer valid in select cities only. T&C More at http://bit.ly/DoubleBenefit "


def _build_tokens(sentence, sentence_id):
    """Return the token dicts for the whitespace-separated words of *sentence*.

    Offsets are measured from the start of *sentence* and satisfy
    ``sentence[begin:end] == word``.
    NOTE(review): the offsets are sentence-relative because the original
    script reset its token counters for every sentence; confirm whether the
    consumer of this file expects document-absolute offsets instead.
    """
    tokens = []
    cursor = 0
    for index, word in enumerate(sentence.split()):
        # Search from the end of the previous token so that repeated words
        # and the whitespace between words are accounted for.
        begin = sentence.index(word, cursor)
        end = begin + len(word)
        cursor = end
        tokens.append({
            "id": "%s-t%d" % (sentence_id, index),
            "begin": begin,
            "end": end,
            "text": word,
            "whiteSpace": False,
        })
    return tokens


def build_sentences(document_text, delimiter="!"):
    """Split *document_text* on *delimiter* and return the sentence dicts.

    ``begin`` / ``end`` are offsets into *document_text* such that
    ``document_text[begin:end] == sentence["text"]``.  The delimiter itself
    belongs to no sentence.  Segments containing only whitespace (for example
    the empty tail after a trailing delimiter) are skipped.
    """
    sentences = []
    cursor = 0
    for segment in document_text.split(delimiter):
        begin = cursor
        end = begin + len(segment)
        # Step over the delimiter that split() removed.
        cursor = end + len(delimiter)
        if not segment.strip():
            continue
        sentence_id = "s%d" % len(sentences)
        sentences.append({
            "id": sentence_id,
            "begin": begin,
            "end": end,
            "text": segment,
            "tokens": _build_tokens(segment, sentence_id),
        })
    return sentences


def build_document(document_text):
    """Return the whole mocked document for *document_text* as a dict.

    The metadata values form the mocked document "head"; they are fixed
    sample values, not derived from *document_text*.
    """
    return {
        "id": "24ccb660-1c74-11e8-909b-b12abc265234-27",
        "name": "reliance_trends.txt",
        "createdDate": 1519814579398,
        "version": 3,
        "text": document_text,
        "docLength": 0,
        "language": "EN",
        "status": "SUBMITTED",
        "modifiedDate": 1519836293913,
        "documentSet": [],
        "preannotation": ["PRE_SIRE"],
        "sentences": build_sentences(document_text),
    }


def main(output_path="teste.json"):
    """Build the mocked document for ``text`` and write it to *output_path*."""
    document = build_document(text)

    # Echo every sentence followed by its words, as a quick visual check.
    for sentence in document["sentences"]:
        print(sentence["text"])
        for token in sentence["tokens"]:
            print(token["text"])

    # The context manager guarantees the file is flushed and closed.  With
    # json's default ensure_ascii=True the output is pure ASCII, so the
    # platform's default text encoding cannot corrupt it.
    with open(output_path, "w") as output_file:
        json.dump(document, output_file, indent=2, separators=(",", " : "))
        output_file.write("\n")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment