Last active
May 27, 2016 03:44
-
-
Save bitmingw/0607956447ed0dfc6c3bcdaf42fff550 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Web crawler and analysis of matrix67's ideagen.
Author: Ming Wen
"""
from bs4 import BeautifulSoup | |
from urllib.request import urlopen | |
from collections import defaultdict | |
import random | |
import pylab | |
def crawler(log_file_name, request_number):
    """
    Gather data by making HTTP requests, then append it to the log file.

    Each fetched phrase is written on its own line, so the number of lines
    already in the file is taken as the number of requests already made.
    Calling the function again therefore resumes an earlier crawl instead
    of starting over.

    log_file_name: str, file name to save data. Created if it is missing.
    request_number: int, total number of desired records in the file.
    Return void.
    """
    IDEAGEN_URL = "http://www.matrix67.com/ideagen/"

    # Count the records already stored. A missing file only means that
    # nothing has been fetched yet; any other I/O error propagates as is.
    try:
        with open(log_file_name, "r", encoding="utf-8") as f:
            current_number_req = sum(1 for _ in f)
    except FileNotFoundError:
        current_number_req = 0

    # Append mode creates the file when it does not exist yet.
    with open(log_file_name, "a", encoding="utf-8") as f:
        while current_number_req < request_number:
            # Close every response as soon as it is read, so that a long
            # crawl does not accumulate open connections.
            with urlopen(IDEAGEN_URL) as req:
                content_str = req.read().decode()

            # The phrase is the text of the first <p> inside the first <div>
            soup = BeautifulSoup(content_str, "html.parser")
            word = soup.div.p.string.strip() + "\n"
            print(current_number_req + 1, word)

            # Save the result to file and update the counter
            f.write(word)
            current_number_req += 1
def split_word(log_file_name):
    """
    Read data from a file, split each word by the delimiter.

    Every line is split at the first occurrence of the delimiter character.
    The line ending is removed before splitting, so the last record yields
    the same noun whether or not the file ends with a newline. Lines that
    do not contain the delimiter (including blank lines) cannot be split
    and are skipped.

    log_file_name: str, data file name.
    Return
    - adjectives: list of str, before delimiter (exclusive).
    - nouns: list of str, after delimiter (exclusive).
    """
    adjectives = []
    nouns = []
    with open(log_file_name, "r", encoding="utf-8") as f:
        for line in f:
            word = line.rstrip("\n")
            # There could be multiple delimit characters;
            # partition() splits at the first one, which is what we want.
            adj, delimiter, n = word.partition("的")
            if not delimiter:
                continue
            adjectives.append(adj)
            nouns.append(n)
    return adjectives, nouns
def list_count(li):
    """
    Tally how many times each distinct element occurs in a list.
    li: list of hashable elements.
    Return: dict, key = element, value = number of occurrences.
    """
    tally = defaultdict(int)
    for item in li:
        # A key seen for the first time starts from the default of 0
        tally[item] = tally[item] + 1
    return tally
def simulate_count(class_number, sample_number):
    """
    Simulate drawing samples from equally likely classes with replacement.
    The random module is re-seeded with a fixed string on every call, so
    the same arguments always produce the same result.
    class_number: int, total number of classes
    sample_number: int, total number of samples
    Return: dict, key = class index, value = number of times it was drawn.
    """
    random.seed("matrix67")
    draws = []
    for _ in range(sample_number):
        draws.append(random.randrange(0, class_number))
    return list_count(draws)
def plot_hist(count_dict, bins, title_str):
    """
    Draw and show a histogram of the repeat counts stored in a dict.
    count_dict: dict, key = word, value = number of repeat.
    bins: int, the number of bins in the histogram.
    title_str: str, the title of figure.
    """
    # Only the counts are plotted; the words themselves are not used
    pylab.hist(list(count_dict.values()), bins=bins)
    pylab.xlabel("Number of Repeat")
    pylab.ylabel("Frequency")
    pylab.title(title_str)
    pylab.show()
if __name__ == "__main__":
    TOTAL_REQUESTS = 1000000
    RECORD_FILE = "record.txt"

    # Collect the data (resuming from whatever the record file holds)
    crawler(RECORD_FILE, TOTAL_REQUESTS)

    # Count how often each adjective and each noun was observed
    adjectives, nouns = split_word(RECORD_FILE)
    adj_dict = list_count(adjectives)
    noun_dict = list_count(nouns)
    adj_dict_num = len(adj_dict)
    noun_dict_num = len(noun_dict)
    print("Total number of adjectives", adj_dict_num)
    print("Total number of nouns", noun_dict_num)

    # Simulate uniform sampling over the same number of distinct words
    simulate_adj_dict = simulate_count(adj_dict_num, TOTAL_REQUESTS)
    simulate_noun_dict = simulate_count(noun_dict_num, TOTAL_REQUESTS)

    # Show observed distributions first, then the simulated ones
    figures = (
        (adj_dict, "Adjectives Distribution"),
        (noun_dict, "Nouns Distribution"),
        (simulate_adj_dict, "Simulated Adjectives Distribution"),
        (simulate_noun_dict, "Simulated Nouns Distribution"),
    )
    for counts, title in figures:
        plot_hist(counts, 30, title)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment