Created
June 27, 2016 05:20
-
-
Save keymastervn/c4e3826b08e183661a4377341510e4e9 to your computer and use it in GitHub Desktop.
crawler.rb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri'
require 'open-uri'
require 'rubygems'
# FIX: a plain `require` with a relative path is not resolved against this
# file's directory on Ruby >= 1.9.2 ('.' was dropped from $LOAD_PATH);
# require_relative is the correct form for loading the sibling helper
# (it provides String#is_valid_url and String#is_category).
require_relative '../string_helper'

# Seed sites: crawling starts from these and never follows links outside them.
SOURCE = [
  "http://vnexpress.net",
  "http://tuoitre.vn",
  "http://news.zing.vn",
  "http://vietnamnet.vn",
  "http://kenh14.vn"
].freeze

# A link is skipped when its href contains any of these fragments
# (assets, scripts, feeds, social widgets, contact pages, in-page anchors).
HREF_SKIP_SUFFIX = [
  ".img", ".imfg", ".jpeg", ".jpg", ".png", ".rss",
  ".css", ".js", "javascript", "document", "mailto:", "e.vnexpress.net",
  "video", "facebook", "google", "twitter", "adv", "pdf", "contact", "lienhe", "lien-he", "#"
].freeze

# A sentence is discarded as noise when it contains any of these tokens.
SENTENCE_SKIP_NOT_INCLUDE = [
  "@", "javascript", "Javascript", "Facebook", "Google", "Twitter", "FB", "facebook", "google",
  "®", "vnexpress", "VnExpress", "Tuoitre", "tuoitre", "kenh14", "zing", "vietnamnet",
  "browser", "-", "\"", "\'", "\n", "\(", "\)", "\/", "%", '“', '”', "&", "$", ":", "_"
].freeze

$output = "crawl_result.txt"
SENTENCE_MIN_LENGTH = 25
MAXSIZE = 200_000_000 # 200MB of text (FIX: old comment said 100MB; the value is 200M)
MAXSTACKCOUNT = 2000
$dict = {}
$new_word = []
# Shared output handle; all crawler threads append extracted sentences here.
$result = File.open($output, "w:UTF-8")
class Crawler
  # Breadth-first crawler for one seed site. Worker threads pull URLs from a
  # shared queue, extract plain-text sentences from <p> elements, append them
  # to the shared $result file, then enqueue in-domain child links.
  def initialize(link)
    @topsite = link
    @list_link = []
    @mutex = Mutex.new
    @queue = []
    @crawled_sites = {}     # url  => true once claimed for fetching
    @crawled_sentences = {} # text => true once written to $result
    start_crawler
    start_fetchers # blocks: joins the (endless) worker threads
  end

  # Thread-safe accessor for the pending-URL queue.
  # NOTE(review): callers should not mutate the returned array directly;
  # the private enqueue/already_crawled? helpers do the mutation under @mutex.
  def queue
    @mutex.synchronize { @queue }
  end

  # Seed the work queue with the top-level site.
  def start_crawler
    enqueue(@topsite)
  end

  # Spawn three worker threads and wait for them. The workers loop forever,
  # so this only returns when the process exits via the MAXSIZE check in #crawl.
  def start_fetchers
    @fetcher_threads = Array.new(3) do
      Thread.new do
        loop do
          # FIX: shift under the mutex. The original synchronized only the
          # *read* of @queue and then called push/shift on the shared array
          # outside the lock — a race between worker threads.
          url = @mutex.synchronize { @queue.shift }
          if url
            crawl(url) unless already_crawled?(url)
            next
          end
          sleep 0.5 # queue drained: idle briefly before polling again
        end
      end
    end
    @fetcher_threads.each(&:join)
  end

  # Fetch one page, write its sentences, and enqueue its child links.
  # FIX: the original declared crawl(link) but invoked it as crawl(*url) and
  # indexed @crawled_sites[*url] — pointless splats on a plain string.
  def crawl(link)
    puts "crawl: #{link}"
    # Hard stop for the whole process once the corpus reaches MAXSIZE bytes.
    exit(true) if File.size($output) > MAXSIZE
    not_crawl_two_times(link)
    begin
      # FIX: Kernel#open on a URL was removed in Ruby 3.0; URI.open is the
      # open-uri entry point and works on 2.5+ as well.
      page = Nokogiri::HTML(URI.open(link))
    rescue StandardError
      return # best-effort: silently skip pages that fail to fetch or parse
    end
    $result.write(get_sentence(page).join("\n") + "\n")
    get_child_link(page).each { |child_link| enqueue(child_link) }
  end

  # Extract sanitized, previously-unseen sentences from the page's <p> tags.
  def get_sentence(page)
    sentences = []
    page.css('p').each do |node|
      line = node.children.to_s.gsub(/<\/?[^>]*>/, "") # strip residual HTML tags
      sentences += line.squeeze(" ").squeeze(".").split(".").map { |s| sentence_sanitize(s) }
    end
    @mutex.synchronize do
      # FIX: the original called collect and discarded its result, so the
      # duplicate-sentence filter never actually applied. Keep unseen ones,
      # then register them as seen.
      sentences = sentences.reject { |s| @crawled_sentences[s] }
      sentences.each { |s| @crawled_sentences[s] = true }
    end
    sentences.delete_if { |s| s.nil? || s.empty? }
  end

  # Mark a URL as visited before fetching, so other workers skip it.
  def not_crawl_two_times(link)
    @mutex.synchronize { @crawled_sites[link] = true }
  end

  # Child links worth following: present, unseen, not a blacklisted
  # asset/social link, and belonging to one of the whitelisted seed sites.
  def get_child_link(page)
    get_all_link_from_page(page).delete_if do |link|
      link.nil? || already_crawled?(link) ||
        HREF_SKIP_SUFFIX.any? { |fragment| link.include?(fragment) } ||
        SOURCE.none? { |site| link.start_with?(site) }
    end
  end

  # Collect every valid absolute href on the page (nil entries for the rest;
  # get_child_link filters those out).
  def get_all_link_from_page(page)
    # FIX: the original `page.css('a').map(&:attributes["href"])` evaluated
    # :attributes["href"] to nil, i.e. map received no block at all and only
    # worked by accident via the returned Enumerator. Iterate the anchor
    # nodes directly instead.
    page.css('a').map do |anchor|
      href = anchor.attributes["href"]
      full_href(href.value) if href.respond_to?(:value) && href.value.is_valid_url
    end
  end

  # Resolve a category-relative path against the seed site's root.
  # (String#is_category / #is_valid_url come from ../string_helper.)
  def full_href(string)
    return File.join(@topsite, string) if string.is_category
    string
  end

  # Clean one candidate sentence; returns nil when it is too short,
  # contains a digit, or contains any blacklisted token.
  def sentence_sanitize(string)
    return nil if string.length < SENTENCE_MIN_LENGTH ||
                  string =~ /\d/ ||
                  SENTENCE_SKIP_NOT_INCLUDE.any? { |token| string.include?(token) }
    # NOTE(review): the second gsub swaps a space for a space — likely a
    # mangled non-breaking-space replacement; kept byte-identical to be safe.
    string.gsub("?", " ").gsub(" ", " ").gsub(",", " ").squeeze(" ").strip
  end

  private

  # Thread-safe enqueue of a pending URL.
  def enqueue(url)
    @mutex.synchronize { @queue.push(url) }
  end

  # Thread-safe visited check.
  def already_crawled?(url)
    @mutex.synchronize { @crawled_sites[url] }
  end
end
# Kick off one crawler per seed site, each on its own thread, and block
# until every crawler finishes.
crawler_threads = SOURCE.map do |site|
  Thread.new { Crawler.new(site) }
end
crawler_threads.each(&:join)

# Optional post-processing step (left disabled): collapse consecutive blank
# lines in the raw crawl output into a cleaned result file.
# a = File.open("crawl_result.txt", "r:UTF-8").readlines
# f = File.open("result.txt", "w:UTF-8")
# f.write(a.join("").gsub("\n\n", "\n"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment