tiagocordeiro · November 9, 2018 03:35
diff --git a/alison-json-builder.py b/alison-json-builder.py
 text = "Get stylish with DOUBLE BENEFIT at TRENDS! Shop for Rs. 4000 & get benefits worth Rs.4000 on the New season collection from 9th Feb-11th Feb'18 & earn RelianceOne pts. RUSH to ur nearest TRENDS store now! Offer valid in select cities only. T&C More at http://bit.ly/DoubleBenefit "


 sentences = text.split('!')
 words = text.split()

 # print("Palavras: ", words)
 # print("Sentences: ", sentences)


 f = open("teste.json", "w")
 '''
 Mockando o "head" do documento
 e inicia sentences
 '''

 f.write('''{
  "id" : "24ccb660-1c74-11e8-909b-b12abc265234-27",
  "name" : "reliance_trends.txt",
  "createdDate" : 1519814579398,
  "version" : 3,
  "text" : "Get stylish with DOUBLE BENEFIT at TRENDS! Shop for Rs. 4000 & get benefits worth Rs.4000 on the New season collection from 9th Feb-11th Feb'18 & earn RelianceOne pts. RUSH to ur nearest TRENDS store now! Offer valid in select cities only. T&C More at http://bit.ly/DoubleBenefit ",
  "docLength" : 0,
  "language" : "EN",
  "status" : "SUBMITTED",
  "modifiedDate" : 1519836293913,
  "documentSet" : [ ],
  "preannotation" : [ "PRE_SIRE" ],
  "sentences" : [ {''')


 s_count = 0 # sentence count
 w_count = 0 # word count
 bs_count = 0 # begin sentencen count
 es_count = 0 # end sentence count


 for sentence in sentences:
    print(sentence)
    sentence_id = 's'+str(s_count)
    f.write('''
    "id" : "%s",
    "begin" : %s,
    "end" : %s,
    "text" : "%s",
    ''' % (sentence_id, bs_count, es_count + len(sentence), sentence)) 
    s_count+=1
    bs_count+=len(sentence)
    es_count+=bs_count

    f.write('''"tokens" : [ {''')
    
    t_count = 0 # tokens count
    bt_count = 0 # begin token count
    et_count = 0 # end token count

    words = sentence.split()
    for word in words:
        et_count+=len(word)
        print(word)
        token_id = sentence_id+'-t'+str(t_count)
        f.write('''
      "id" : "%s",
      "begin" : %s,
      "end" : %s,
      "text" : "%s",
      "whiteSpace" : false''' % (token_id, bt_count, et_count, word))
        t_count+=1
        bt_count+=len(word)

        f.write('''
    }, {''')
	text = "Get stylish with DOUBLE BENEFIT at TRENDS! Shop for Rs. 4000 & get benefits worth Rs.4000 on the New season collection from 9th Feb-11th Feb'18 & earn RelianceOne pts. RUSH to ur nearest TRENDS store now! Offer valid in select cities only. T&C More at http://bit.ly/DoubleBenefit "


	sentences = text.split('!')
	words = text.split()

	# print("Palavras: ", words)
	# print("Sentences: ", sentences)


	f = open("teste.json", "w")
	'''
	Mockando o "head" do documento
	e inicia sentences
	'''

	f.write('''{
	"id" : "24ccb660-1c74-11e8-909b-b12abc265234-27",
	"name" : "reliance_trends.txt",
	"createdDate" : 1519814579398,
	"version" : 3,
	"text" : "Get stylish with DOUBLE BENEFIT at TRENDS! Shop for Rs. 4000 & get benefits worth Rs.4000 on the New season collection from 9th Feb-11th Feb'18 & earn RelianceOne pts. RUSH to ur nearest TRENDS store now! Offer valid in select cities only. T&C More at http://bit.ly/DoubleBenefit ",
	"docLength" : 0,
	"language" : "EN",
	"status" : "SUBMITTED",
	"modifiedDate" : 1519836293913,
	"documentSet" : [ ],
	"preannotation" : [ "PRE_SIRE" ],
	"sentences" : [ {''')


	s_count = 0 # sentence count
	w_count = 0 # word count
	bs_count = 0 # begin sentencen count
	es_count = 0 # end sentence count


	for sentence in sentences:
	print(sentence)
	sentence_id = 's'+str(s_count)
	f.write('''
	"id" : "%s",
	"begin" : %s,
	"end" : %s,
	"text" : "%s",
	''' % (sentence_id, bs_count, es_count + len(sentence), sentence))
	s_count+=1
	bs_count+=len(sentence)
	es_count+=bs_count

	f.write('''"tokens" : [ {''')

	t_count = 0 # tokens count
	bt_count = 0 # begin token count
	et_count = 0 # end token count

	words = sentence.split()
	for word in words:
	et_count+=len(word)
	print(word)
	token_id = sentence_id+'-t'+str(t_count)
	f.write('''
	"id" : "%s",
	"begin" : %s,
	"end" : %s,
	"text" : "%s",
	"whiteSpace" : false''' % (token_id, bt_count, et_count, word))
	t_count+=1
	bt_count+=len(word)

	f.write('''
	}, {''')