This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
import codecs # Unicode support | |
from pymongo import Connection # For DB Connection | |
from pymongo.errors import ConnectionFailure # For catching exceptions | |
def main(): | |
# MongoDB connection | |
try: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def main(): | |
# Command-line parsing supports filename*.txt | |
# Make a list of command line arguments, omitting the [0] element which is the script itself. | |
args = sys.argv[1:] | |
if not args: | |
print 'Some message.' | |
sys.exit(1) | |
for filename in args: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re # Regular Expressions | |
import collections # Data Types | |
import sys # File operations | |
import codecs # Unicode support | |
def scrape(page): | |
# Dump raw HTML into Soup | |
raw_data = codecs.open(page, 'r', encoding='utf-8').read() | |
soup = BeautifulSoup(raw_data) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
soup(lambda tag: tag.name == 'div' and tag.get('class') == ['some-class']) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, random | |
random.choice(os.listdir("INSERT-DIR")) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Non-recursive | |
import os | |
def load_directory(data_path): | |
files_list = [] | |
try: | |
for file_name in os.listdir(data_path): | |
if file_name.endswith(".html"): | |
files_list.append(file_name) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
file_header = "<html>\n<head>\n<meta charset='utf-8'>\n</head>\n<body>\n" | |
file_footer = "\n</body>\n</html>" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def longest_common_substring(s1, s2): | |
m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))] | |
longest, x_up_to = 0, 0 | |
for x in range(1, 1 + len(s1)): | |
for y in range(1, 1 + len(s2)): # match every char in s2 against every char in s1 | |
if s1[x - 1] == s2[y - 1]: # record a char match | |
m[x][y] = m[x - 1][y - 1] + 1 # char match tally will accumulate if previous char also matched | |
if m[x][y] > longest: | |
longest = m[x][y] | |
x_up_to = x # record char position of last found match |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
AA ɑ | |
AA0 ɑ | |
AA1 ɑ | |
AA2 ɑ | |
AE æ | |
AE0 æ | |
AE1 æ | |
AE2 æ | |
AH ə | |
AH0 ə |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re # Regular Expressions | |
import collections # Data Types | |
import sys # File operations | |
import codecs # Unicode support | |
import os | |
def clear_output_file(out_file): | |
file_header ="""<html> | |
<head> |
OlderNewer