#!/usr/bin/env python3
# filename: plagia.py

# native libs
import re
from collections import defaultdict

# external libs
from bs4 import BeautifulSoup

# local libs
from crawler import CrawlerCache

crawler_cache = CrawlerCache('crawler.db')
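
# CrawlerCache comes from the companion crawler module; this script only
# relies on the two methods used below: get_urls(domain), which lists the
# paths crawled for a domain, and get(domain=..., url=...), which returns
# the cached HTML for one page.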
# Config per domain:
# - url_re filters crawled paths down to article-looking URLs
# - get_content extracts the article body from the parsed page
# - words counts, per domain, how many articles each word appears in
# - urls maps each article URL to its set of words
sites = (
    {
        'domain': 'techcrunch.com',
        'url_re': re.compile(r'^/\d{4}/\d{2}/\d{2}/*').match,
        'get_content': lambda page: page.find('div', 'article-entry'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.engadget.com',
        'url_re': re.compile(r'^/\d{4}/\d{2}/\d{2}/*').match,
        'get_content': lambda page: page.find('div', 'post-body'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'gizmodo.com',
        'url_re': re.compile(r'^/[a-z0-9-]*-\d{5,12}').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.zdnet.com',
        'url_re': re.compile(r'^/[a-z0-9-]*-\d{5,12}/$').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
    {
        'domain': 'www.wired.com',
        'url_re': re.compile(r'^/\d{4}/\d{2}/[^/]*/$').match,
        'get_content': lambda page: page.find('article', 'post'),
        'words': defaultdict(int),
        'urls': {},
    },
)
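
# Quick sanity check of the URL filters; the sample paths here are invented
# for illustration, not taken from the crawl. Only a match object is truthy,
# so the filter below keeps article-looking paths and drops everything else.
assert sites[0]['url_re']('/2013/06/02/some-article-slug/')
assert not sites[0]['url_re']('/about/')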

# First pass: filter the URLs, extract the content, and store all the
# words for later statistics
for site in sites:
    domain = site['domain']
    # retrieve all the URLs for the current domain that match the article format
    urls = {u for u in crawler_cache.get_urls(domain) if site['url_re'](u)}
    for url in urls:
        html = crawler_cache.get(domain=domain, url=url)
        if html:
            # use BeautifulSoup to navigate the document
            page = BeautifulSoup(html, 'html.parser')
            # retrieve the content of the article
            content = site['get_content'](page)
            if content:
                # remove the script tags' content from the article
                # (yes, there is JS inside the article :/)
                for script in content.find_all('script'):
                    script.clear()
                # strip the tags and split the text into words
                article_words = content.get_text().split()
                # articles with fewer than 200 words kind of suck,
                # so let's ignore those
                if len(article_words) > 200:
                    # keep unique words by putting them in a set
                    article_words = set(w.lower() for w in article_words)
                    site['urls'][url] = article_words
                    # count in how many articles each word occurs (per domain)
                    for word in article_words:
                        site['words'][word] += 1
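
# After the first pass, each site dict holds data shaped like this
# (the values are invented for illustration):
#   site['urls']  -> {'/2013/06/02/some-slug/': {'tegra', 'benchmark', ...}, ...}
#   site['words'] -> {'tegra': 2, 'benchmark': 7, ...}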

# Second pass: remove the words that are common across the domain's articles
for site in sites:
    # words present in over 5% of the domain's articles are considered noise
    threshold = len(site['urls']) * .05
    noisy_words = set(w for w, c in site['words'].items() if c > threshold)
    for url in site['urls']:
        # drop them using the set difference feature, pretty sweet
        site['urls'][url] = site['urls'][url].difference(noisy_words)
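
# Toy example of the filtering above (values invented): with
# noisy_words == {'the', 'google'}, an article's word set
# {'the', 'google', 'tegra', 'overclock'} shrinks to {'tegra', 'overclock'},
# leaving only the rare words that can fingerprint the article.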

# We can now compare the articles to each other across domains
plagia = defaultdict(list)
for site in sites:
    for other_site in sites:
        # we don't match a site against itself :|
        if other_site['domain'] == site['domain']:
            continue
        # go through every article of the current domain
        for url, words in site['urls'].items():
            # the minimum match has to be 10% of the article's words
            best_score = len(words) * .1
            match = ''
            # compare the article to the other domain's articles
            for other_url, other_words in other_site['urls'].items():
                # count how many "rare" words the two articles share
                score = len(words.intersection(other_words))
                if score > best_score:
                    # woohoo, if you're here you're the new best match
                    match = other_url
                    best_score = score
            if match:
                full_url = 'http://%s%s' % (site['domain'], url)
                full_other_url = 'http://%s%s' % (other_site['domain'], match)
                plagia[full_url].append((
                    best_score,
                    (best_score * 100.0) / len(words),  # percentage of shared words
                    full_other_url,
                ))
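
# Note: best_score starts at the 10% floor, so an article pairs with at most
# one article per foreign domain, the one sharing the most rare words, and
# only when that overlap clears the floor.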

# print the report: each matched article followed by its best matches
for url, matches in plagia.items():
    print(url)
    for match in matches:
        print('\t%s\t%d%%\t%s' % match)
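
# Usage sketch (assumes crawler.db was populated beforehand by the crawler
# module; the sample output below is invented for illustration):
#   $ python3 plagia.py
#   http://techcrunch.com/2013/06/02/some-slug/
#       42	12%	http://www.engadget.com/2013/06/02/other-slug/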