This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#eugene script | |
def eugener(path = 'data/nyt/earlylate', | |
regex = r'(?i)\brisk', | |
depth = 5, | |
top = 10, | |
remove_stopwords = False): | |
""" | |
get most frequent words in corpus path to left and right |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def eugener(path = 'data/nyt/earlylate', depth = 5, top = 10): | |
import os | |
import nltk | |
import re | |
from collections import Counter | |
import pandas as pd | |
# get list of subcorpora | |
dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] | |
# define risk word | |
regex = r'(?i)\brisk' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1995 | |
[-5, [('the', 351), ('to', 226), ('of', 159), ('and', 157), ('a', 157), ('that', 156), ('in', 93), ('is', 80), ('for', 72), ('not', 68)]] | |
[-4, [('the', 333), ('to', 199), ('a', 164), ('that', 161), ('of', 140), ('and', 133), ('it', 90), ('are', 89), ('is', 83), ('in', 79)]] | |
[-3, [('to', 419), ('the', 263), ('a', 164), ('and', 150), ('that', 135), ('of', 124), ('are', 116), ('is', 113), ('there', 108), ('it', 98)]] | |
[-2, [('the', 396), ('a', 292), ('of', 232), ('to', 204), ('is', 180), ('and', 180), ('at', 178), ('take', 128), ('that', 126), ('are', 97)]] | |
[-1, [('the', 1733), ('at', 549), ('a', 432), ('and', 212), ('to', 191), ('of', 188), ('high', 127), ('health', 91), ('increased', 84), ('their', 77)]] | |
[0, [('risk', 4749), ('risks', 1611), ('risky', 684), ('risking', 157), ('risked', 141), ('riskier', 82), ('riskiest', 32), ('riske', 31), ('riskin', 13), ('riskiness', 12)]] | |
[1, [('of', 1591), ('to', 297), ('and', 240), ('for', 218), ('that', 196), ('in', 160), ('is', 116), ('the', 114), ('factors', 110), |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!sudo yum -y install java | |
!git clone https://www.github.com/interrogator/risk | |
import corpkit | |
from corpkit import interrogator, plotter, quickview | |
import pandas as pd | |
corpus = 'data/nyt/years' | |
#immediate sister to left of risk word | |
query = r'__ $. /(?i).?\brisk.?/' | |
# interrogate, output words only |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def parse_sfl(n = 3): | |
from bs4 import BeautifulSoup | |
import os | |
from collections import defaultdict | |
# path to xml files | |
xmlpath = 'XML' | |
# list of sfl categories | |
sfl_list = [ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extract () { | |
if [ -f $1 ] ; then | |
case $1 in | |
*.tar.bz2) tar xjf $1 ;; | |
*.tar.gz) tar xzf $1 ;; | |
*.bz2) bunzip2 $1 ;; | |
*.rar) rar x $1 ;; | |
*.gz) gunzip $1 ;; | |
*.tar) tar xf $1 ;; | |
*.tbz2) tar xjf $1 ;; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def quicktree(sentence): | |
"""Parse a sentence and return a visual representation""" | |
from nltk import Tree | |
from nltk.draw.util import CanvasFrame | |
from nltk.draw import TreeWidget | |
from stat_parser import Parser | |
from IPython.display import display | |
from IPython.display import Image | |
parser = Parser() | |
parsed = parser.parse(sentence) |
NewerOlder