This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script performs POS tagging and NER (Named Entity Recognition) on a supplied text file.
#Date: Nov 2 2012 | |
#Author: Hota Sobhan | |
import nltk | |
# Read every line of the input file into a list.
# The path is written as a raw string so its backslashes are kept literally
# instead of being parsed as (unrecognized) escape sequences, and the `with`
# block closes the file handle as soon as the lines have been read.
with open(r'C:\Python27\Test_File.txt') as f:
    data = f.readlines()
# Parse the text file for NER with POS tagging
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Author: Sobhan Hota | |
#Date: Oct 20 2012 | |
# Script generates a vector for the bigrams collected in the source file:
# it captures each bigram's count from the supplied input file (if present),
# then divides that count by the input file's document length.
import itertools | |
from collections import Counter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Author: Sobhan Hota | |
# Finds the 500 most frequent words in a given file
from string import punctuation | |
from operator import itemgetter | |
# Number of top-ranked words to keep (the header describes the 500 most
# frequent words).
N = 500
# NOTE(review): presumably filled later as a word -> count mapping; confirm
# against the code that populates it.
words = {}
words_gen = (word.strip(punctuation).lower() for line in open("C:\Python27\Corpus.txt") |
NewerOlder