Skip to content

Instantly share code, notes, and snippets.

@earlwlkr
Created January 29, 2015 10:45
Show Gist options
  • Select an option

  • Save earlwlkr/e0f8fd602ae4f261f8ce to your computer and use it in GitHub Desktop.

Select an option

Save earlwlkr/e0f8fd602ae4f261f8ce to your computer and use it in GitHub Desktop.
Parse RSS content from VnExpress.net
# -*- coding: utf-8 -*-
# Python code to parse news content from VnExpress RSS Feeds.
import os
import re
from bs4 import BeautifulSoup # external lib
import requests # external lib
import feedparser # external lib
# Stop once this many articles have been appended to the corpus.
MAX_LINKS = 5
# Seconds to wait on any HTTP request; without a timeout a stalled server
# would hang the crawl indefinitely.
REQUEST_TIMEOUT = 10
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and to avoid bs4's warning).
HTML_PARSER = 'html.parser'

# Relative href of an RSS feed on the index page, e.g. '/rss/<name>.rss'.
# The dot is escaped so that only a literal '.rss' suffix is accepted.
rss_re = re.compile(r'/rss/[a-z?-]+\.rss', flags=re.UNICODE)
# Rough word tokenizer; not accurate yet.
word_re = re.compile(r'(\w+)', flags=re.UNICODE)
main_url = 'http://vnexpress.net'


def _find_rss_links(session):
    """Return the feed URLs linked from the RSS index page.

    The URLs are absolute, de-duplicated, and kept in page order.
    Raises requests.RequestException if the index page cannot be fetched.
    """
    response = session.get(main_url + '/rss', timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    index_soup = BeautifulSoup(response.content, HTML_PARSER)

    rss_links = []
    seen = set()
    # Scan every link on the RSS index page to find the feed links.
    for a in index_soup.find_all('a', href=True):
        if not rss_re.match(a['href']):
            continue
        rss_link = main_url + a['href']
        if rss_link not in seen:
            seen.add(rss_link)
            rss_links.append(rss_link)
    return rss_links


def _extract_lines(html):
    """Return one corpus line per article paragraph found in html.

    Each line is the paragraph's words, every word followed by a single
    space, terminated by a newline.
    """
    soup = BeautifulSoup(html, HTML_PARSER)
    lines = []
    # The article body is made of <p> tags carrying the CSS class 'Normal'.
    for tag in soup.find_all('p', class_='Normal'):
        # NOTE: .string is None when a paragraph contains nested markup,
        # so such paragraphs are skipped.
        if tag.string:
            words = word_re.findall(tag.string)
            lines.append(''.join(word + ' ' for word in words) + '\n')
    return lines


def _crawl(session, corpus_file):
    """Append article text to corpus_file until MAX_LINKS articles are stored.

    Returns the set of article URLs that were parsed and written.
    """
    parsed_links = set()
    for rss_link in _find_rss_links(session):
        print('Parsing RSS: ' + rss_link)
        feed = feedparser.parse(rss_link)
        for item in feed['items']:
            try:
                link = item['link']
                if link in parsed_links:
                    continue
                print('Parsing article: ' + link)
                response = session.get(link, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                lines = _extract_lines(response.content)
            except Exception as exc:
                # Best effort: one bad article must not abort the crawl.
                # Catching Exception (not a bare except) keeps Ctrl-C working.
                print('Error, skipping... ({0})'.format(exc))
                continue
            # Write only after the whole article parsed, so a failure never
            # leaves a partially written article in the corpus.
            corpus_file.writelines(lines)
            parsed_links.add(link)
            if len(parsed_links) >= MAX_LINKS:
                return parsed_links
    return parsed_links


# The context managers close the corpus file and the HTTP session even if
# the crawl raises.
with open('corpus.txt', mode='a', encoding='utf8') as corpus_file:
    with requests.Session() as session:
        parsed_links = _crawl(session, corpus_file)
print('\n\nParsed a total of {0} articles.'.format(len(parsed_links)))
@phuongnm94
Copy link

thanks

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment