Created
March 4, 2013 22:08
-
-
Save peregrinogris/5086139 to your computer and use it in GitHub Desktop.
Split text into sentences and output them as JSON.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import sys

from BeautifulSoup import BeautifulSoup
def split_sentences(text):
    """Split *text* on '.' and return the non-empty sentences.

    Surrounding whitespace is removed from the text and from every
    fragment.  Each fragment that was followed by a '.' in the input gets
    that '.' back; a trailing fragment with no '.' after it is returned
    as-is.  Fragments that are empty or whitespace-only are skipped.
    """
    fragments = text.strip().split('.')
    # Only the final fragment is not followed by a '.' in the input.
    last_index = len(fragments) - 1
    sentences = []
    for index, fragment in enumerate(fragments):
        # Strip BEFORE testing for emptiness so that whitespace-only
        # fragments never turn into a bare "." entry and one-character
        # sentences are not silently discarded.
        fragment = fragment.strip()
        if not fragment:
            continue
        if index < last_index:
            fragment = fragment + '.'
        sentences.append(fragment)
    return sentences


def main(argv=None):
    """Print, as a JSON list, the sentences found in an HTML file.

    argv[1] is the input file; argv[2] is the optional tag name whose
    text is extracted (defaults to 'div').  With no input file a usage
    line is printed instead.  *argv* defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print(argv[0] + ' <input file>')
        return

    in_file = argv[1]
    tag = argv[2] if len(argv) >= 3 else 'div'

    # Open the path exactly as given (prefixing './' would break absolute
    # paths) and make sure the handle is closed once parsing is done.
    with open(in_file) as handle:
        soup = BeautifulSoup(handle)

    out = []
    for element in soup.findAll(tag):
        # getText(" ") joins the element's text nodes with single spaces.
        out.extend(split_sentences(element.getText(" ")))
    print(json.dumps(out))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment