#!/usr/bin/env python3
# filename: plagia.py

# native libs
import re
from collections import defaultdict

# external libs
from bs4 import BeautifulSoup

# local libs
from crawler import CrawlerCache

crawler_cache = CrawlerCache('crawler.db')
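
# CrawlerCache comes from the companion crawler module; this script only
# relies on the two methods used below: get_urls(domain), which lists the
# paths crawled for a domain, and get(domain=..., url=...), which returns
# the cached HTML for one page.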
# Config per domain:
# - url_re filters crawled paths down to article-looking URLs
# - get_content extracts the article body from the parsed page
# - words counts, per domain, how many articles each word appears in
# - urls maps each article URL to its set of words
sites = (
    {
        'domain': 'techcrunch.com',
        'url_re': re.compile(r'^/\d{4}/\d{2}/\d{2}/*').match,
        'get_content': lambda page: page.find('div', 'article-entry'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.engadget.com',
        'url_re': re.compile(r'^/\d{4}/\d{2}/\d{2}/*').match,
        'get_content': lambda page: page.find('div', 'post-body'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'gizmodo.com',
        'url_re': re.compile(r'^/[a-z0-9-]*-\d{5,12}').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.zdnet.com',
        'url_re': re.compile(r'^/[a-z0-9-]*-\d{5,12}/$').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.wired.com',
        'url_re': re.compile(r'^/\d{4}/\d{2}/[^/]*/$').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
)
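
# Quick sanity check of the URL filters; the sample paths here are invented
# for illustration, not taken from the crawl. Only a match object is truthy,
# so the filter below keeps article-looking paths and drops everything else.
assert sites[0]['url_re']('/2013/06/02/some-article-slug/')
assert not sites[0]['url_re']('/about/')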

# First pass: filter the URLs, extract the content, and store all the
# words for later statistics
for site in sites:
    domain = site['domain']
    # retrieve all the URLs for the current domain that match the article format
    urls = {u for u in crawler_cache.get_urls(domain) if site['url_re'](u)}
    for url in urls:
        html = crawler_cache.get(domain=domain, url=url)
        if html:
            # use BeautifulSoup to navigate the document
            page = BeautifulSoup(html, 'html.parser')
            # retrieve the content of the article
            content = site['get_content'](page)
            if content:
                # remove the script tags' content from the article
                # (yes, there is JS inside the article :/)
                for script in content.find_all('script'):
                    script.clear()
                # strip the tags and split the text into words
                article_words = content.get_text().split()
                # articles with fewer than 200 words kind of suck,
                # so let's ignore those
                if len(article_words) > 200:
                    # keep unique words by putting them in a set
                    article_words = set(w.lower() for w in article_words)
                    site['urls'][url] = article_words
                    # count in how many articles each word occurs (per domain)
                    for word in article_words:
                        site['words'][word] += 1
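
# After the first pass, each site dict holds data shaped like this
# (the values are invented for illustration):
#   site['urls']  -> {'/2013/06/02/some-slug/': {'tegra', 'benchmark', ...}, ...}
#   site['words'] -> {'tegra': 2, 'benchmark': 7, ...}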

# Second pass: remove the words that are common across the domain's articles
for site in sites:
    # words present in over 5% of the domain's articles are considered noise
    threshold = len(site['urls']) * .05
    noisy_words = set(w for w, c in site['words'].items() if c > threshold)
    for url in site['urls']:
        # drop them using the set difference feature, pretty sweet
        site['urls'][url] = site['urls'][url].difference(noisy_words)
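
# Toy example of the filtering above (values invented): with
# noisy_words == {'the', 'google'}, an article's word set
# {'the', 'google', 'tegra', 'overclock'} shrinks to {'tegra', 'overclock'},
# leaving only the rare words that can fingerprint the article.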

# We can now compare the articles to each other across domains
plagia = defaultdict(list)
for site in sites:
    for other_site in sites:
        # we don't match a site against itself :|
        if other_site['domain'] == site['domain']:
            continue
        # go through every article of the current domain
        for url, words in site['urls'].items():
            # the minimum match has to be 10% of the article's words
            best_score = len(words) * .1
            match = ''
            # compare the article to the other domain's articles
            for other_url, other_words in other_site['urls'].items():
                # count how many "rare" words the two articles share
                score = len(words.intersection(other_words))
                if score > best_score:
                    # woohoo, if you're here you're the new best match
                    match = other_url
                    best_score = score
            if match:
                full_url = 'http://%s%s' % (site['domain'], url)
                full_other_url = 'http://%s%s' % (other_site['domain'], match)
                plagia[full_url].append((
                    best_score,
                    (best_score * 100.0) / len(words),  # percentage of shared words
                    full_other_url,
                ))
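
# Note: best_score starts at the 10% floor, so an article pairs with at most
# one article per foreign domain, the one sharing the most rare words, and
# only when that overlap clears the floor.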

# print the report: each matched article followed by its best matches
for url, matches in plagia.items():
    print(url)
    for match in matches:
        print('\t%s\t%d%%\t%s' % match)
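
# Usage sketch (assumes crawler.db was populated beforehand by the crawler
# module; the sample output below is invented for illustration):
#   $ python3 plagia.py
#   http://techcrunch.com/2013/06/02/some-slug/
#       42	12%	http://www.engadget.com/2013/06/02/other-slug/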