earlwlkr · January 29, 2015 10:45 · phuongnm94 · Oct 16, 2016
diff --git a/rss-parser.py b/rss-parser.py
 # -*- coding: utf-8 -*-
 # Python code to parse news content from VnExpress RSS Feeds.
 import os
 import re
 from bs4 import BeautifulSoup   # external lib
 import requests                 # external lib
 import feedparser               # external lib


 # Stop after this many articles have been written to the corpus.
 MAX_LINKS = 5
 # Seconds to wait for a single HTTP response before giving up on it.
 REQUEST_TIMEOUT = 10
 # Named explicitly so BeautifulSoup does not pick a different parser per machine.
 HTML_PARSER = 'html.parser'

 # Matches hrefs that start with /rss/<name>.rss; the dot is escaped so that
 # only a literal ".rss" is accepted.
 rss_re = re.compile(r'/rss/[a-z?-]+\.rss', flags=re.UNICODE)
 # Approximate tokenizer (runs of word characters) - not fully accurate yet.
 word_re = re.compile(r'(\w+)', flags=re.UNICODE)

 parsed_rss = []
 parsed_links = []

 stop = False

 main_url = 'http://vnexpress.net'
 session = requests.Session()


 def fetch_soup(url):
     """Download url through the shared session and parse it as HTML.

     Raises requests.RequestException on connection errors, timeouts and
     HTTP error statuses, so error pages are never parsed as content.
     """
     response = session.get(url, timeout=REQUEST_TIMEOUT)
     response.raise_for_status()
     return BeautifulSoup(response.content, HTML_PARSER)


 def article_lines(soup):
     """Return one corpus line per <p class="Normal"> paragraph that has words.

     Each line is the paragraph's words separated by single spaces, followed
     by a trailing space and a newline (the format the corpus already uses).
     """
     lines = []
     # Article body paragraphs carry class='Normal'.
     for tag in soup.find_all('p', class_='Normal'):
         # get_text() also covers paragraphs with inline markup, for which
         # tag.string is None and the text would otherwise be dropped.
         words = word_re.findall(tag.get_text())
         if words:
             lines.append(' '.join(words) + ' \n')
     return lines


 # The with-block guarantees the corpus file is closed even if a request fails.
 with open('corpus.txt', mode='a', encoding='utf8') as corpus_file:
     main_soup = fetch_soup(main_url + '/rss')

     # Scan every link on the root RSS page to find the RSS feed links.
     for a in main_soup.find_all('a'):
         href = a.get('href')
         if not href or not rss_re.match(href):
             continue

         rss_link = main_url + href
         if rss_link in parsed_rss:
             continue

         print('Parsing RSS: ' + rss_link)
         feed = feedparser.parse(rss_link)

         for item in feed['items']:
             try:
                 link = item['link']
                 if link in parsed_links:
                     continue

                 print('Parsing article: ' + link)
                 # Collect the whole article first so that a failure midway
                 # does not leave a partially written article in the corpus.
                 corpus_file.writelines(article_lines(fetch_soup(link)))
                 parsed_links.append(link)
             except Exception as exc:
                 # Best effort: one bad article must not abort the run, but
                 # Ctrl-C still works and the cause is reported.
                 print('Error, skipping...', repr(exc))
                 continue

             if len(parsed_links) >= MAX_LINKS:
                 stop = True
                 break

         parsed_rss.append(rss_link)
         if stop:
             break

 print('\n\nParsed a total of {0} articles.'.format(len(parsed_links)))
	# -*- coding: utf-8 -*-
	# Python code to parse news content from VnExpress RSS Feeds.
	import os
	import re
	from bs4 import BeautifulSoup # external lib
	import requests # external lib
	import feedparser # external lib


	# Stop after this many articles have been written to the corpus.
	MAX_LINKS = 5
	# Seconds to wait for a single HTTP response before giving up on it.
	REQUEST_TIMEOUT = 10
	# Named explicitly so BeautifulSoup does not pick a different parser per machine.
	HTML_PARSER = 'html.parser'

	# Matches hrefs that start with /rss/<name>.rss; the dot is escaped so that
	# only a literal ".rss" is accepted.
	rss_re = re.compile(r'/rss/[a-z?-]+\.rss', flags=re.UNICODE)
	# Approximate tokenizer (runs of word characters) - not fully accurate yet.
	word_re = re.compile(r'(\w+)', flags=re.UNICODE)

	parsed_rss = []
	parsed_links = []

	stop = False

	main_url = 'http://vnexpress.net'
	session = requests.Session()


	def fetch_soup(url):
	    """Download url through the shared session and parse it as HTML.

	    Raises requests.RequestException on connection errors, timeouts and
	    HTTP error statuses, so error pages are never parsed as content.
	    """
	    response = session.get(url, timeout=REQUEST_TIMEOUT)
	    response.raise_for_status()
	    return BeautifulSoup(response.content, HTML_PARSER)


	def article_lines(soup):
	    """Return one corpus line per <p class="Normal"> paragraph that has words.

	    Each line is the paragraph's words separated by single spaces, followed
	    by a trailing space and a newline (the format the corpus already uses).
	    """
	    lines = []
	    # Article body paragraphs carry class='Normal'.
	    for tag in soup.find_all('p', class_='Normal'):
	        # get_text() also covers paragraphs with inline markup, for which
	        # tag.string is None and the text would otherwise be dropped.
	        words = word_re.findall(tag.get_text())
	        if words:
	            lines.append(' '.join(words) + ' \n')
	    return lines


	# The with-block guarantees the corpus file is closed even if a request fails.
	with open('corpus.txt', mode='a', encoding='utf8') as corpus_file:
	    main_soup = fetch_soup(main_url + '/rss')

	    # Scan every link on the root RSS page to find the RSS feed links.
	    for a in main_soup.find_all('a'):
	        href = a.get('href')
	        if not href or not rss_re.match(href):
	            continue

	        rss_link = main_url + href
	        if rss_link in parsed_rss:
	            continue

	        print('Parsing RSS: ' + rss_link)
	        feed = feedparser.parse(rss_link)

	        for item in feed['items']:
	            try:
	                link = item['link']
	                if link in parsed_links:
	                    continue

	                print('Parsing article: ' + link)
	                # Collect the whole article first so that a failure midway
	                # does not leave a partially written article in the corpus.
	                corpus_file.writelines(article_lines(fetch_soup(link)))
	                parsed_links.append(link)
	            except Exception as exc:
	                # Best effort: one bad article must not abort the run, but
	                # Ctrl-C still works and the cause is reported.
	                print('Error, skipping...', repr(exc))
	                continue

	            if len(parsed_links) >= MAX_LINKS:
	                stop = True
	                break

	        parsed_rss.append(rss_link)
	        if stop:
	            break

	print('\n\nParsed a total of {0} articles.'.format(len(parsed_links)))
No results found