Skip to content

Instantly share code, notes, and snippets.

@tiagocordeiro
Last active November 9, 2018 03:35
Show Gist options
  • Save tiagocordeiro/4c14a1e85390e70154a94d1182f47b03 to your computer and use it in GitHub Desktop.
Save tiagocordeiro/4c14a1e85390e70154a94d1182f47b03 to your computer and use it in GitHub Desktop.
"""Build a mock annotation-document JSON file from a sample SMS text.

The sample text is split into sentences on ``!`` and every sentence into
whitespace-separated tokens.  Each sentence and token is stored together
with its character offsets, and the resulting document is serialised to
``teste.json`` with the ``json`` module so the file is always valid JSON.
"""
import json

text = "Get stylish with DOUBLE BENEFIT at TRENDS! Shop for Rs. 4000 & get benefits worth Rs.4000 on the New season collection from 9th Feb-11th Feb'18 & earn RelianceOne pts. RUSH to ur nearest TRENDS store now! Offer valid in select cities only. T&C More at http://bit.ly/DoubleBenefit "

OUTPUT_PATH = "teste.json"
SENTENCE_DELIMITER = "!"


def build_tokens(sentence, sentence_id):
    """Return the token records for one sentence.

    Args:
        sentence: The sentence text; it is split on whitespace.
        sentence_id: Identifier of the owning sentence, used as the prefix
            of every token id (``<sentence_id>-t<index>``).

    Returns:
        A list of dicts with the keys ``id``, ``begin``, ``end``, ``text``
        and ``whiteSpace``.  Offsets are relative to the start of
        *sentence*, so ``sentence[begin:end] == text`` for every token.
    """
    tokens = []
    cursor = 0
    for index, word in enumerate(sentence.split()):
        # Search from the end of the previous token so that repeated words
        # resolve to their own occurrence and the whitespace between
        # tokens is reflected in the offsets.
        begin = sentence.index(word, cursor)
        end = begin + len(word)
        tokens.append({
            "id": "%s-t%d" % (sentence_id, index),
            "begin": begin,
            "end": end,
            "text": word,
            "whiteSpace": False,
        })
        cursor = end
    return tokens


def build_sentences(doc_text, delimiter=SENTENCE_DELIMITER):
    """Split *doc_text* on *delimiter* and return the sentence records.

    Args:
        doc_text: The full document text.
        delimiter: Separator between sentences.  It is not part of any
            sentence, but it is accounted for when computing offsets.

    Returns:
        A list of dicts with the keys ``id``, ``begin``, ``end``, ``text``
        and ``tokens``.  Offsets are relative to *doc_text*, so
        ``doc_text[begin:end] == text`` for every sentence.
    """
    sentences = []
    begin = 0
    for index, sentence in enumerate(doc_text.split(delimiter)):
        sentence_id = "s%d" % index
        end = begin + len(sentence)
        sentences.append({
            "id": sentence_id,
            "begin": begin,
            "end": end,
            "text": sentence,
            "tokens": build_tokens(sentence, sentence_id),
        })
        # Skip the delimiter that str.split() removed.
        begin = end + len(delimiter)
    return sentences


def build_document(doc_text):
    """Return the complete document dict for *doc_text*.

    Every header field except ``text`` is a fixed mock value.
    """
    return {
        # Mocked document header.
        "id": "24ccb660-1c74-11e8-909b-b12abc265234-27",
        "name": "reliance_trends.txt",
        "createdDate": 1519814579398,
        "version": 3,
        "text": doc_text,
        "docLength": 0,
        "language": "EN",
        "status": "SUBMITTED",
        "modifiedDate": 1519836293913,
        "documentSet": [],
        "preannotation": ["PRE_SIRE"],
        "sentences": build_sentences(doc_text),
    }


def main():
    """Print every sentence and token, then write the document to disk."""
    document = build_document(text)

    # Progress output: each sentence followed by its tokens.
    for sentence in document["sentences"]:
        print(sentence["text"])
        for token in sentence["tokens"]:
            print(token["text"])

    # The context manager guarantees the file is flushed and closed.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(document, f, indent=2, separators=(",", " : "))
        f.write("\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment