Created
November 4, 2012 09:49
-
-
Save wynemo/4011104 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| PyRSS2Gen==1.1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| http://www.imdb.com/title/tt0096697/ | |
| http://www.imdb.com/title/tt1830617/ | |
| http://www.imdb.com/title/tt0121955/ | |
| http://www.imdb.com/title/tt0944947/ | |
| http://www.imdb.com/title/tt1592154/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import json | |
| import urllib2 | |
| import re | |
| import uuid | |
| import PyRSS2Gen | |
| from datetime import datetime | |
# Greedy, dot-matches-newline pattern: a single match spans from the first
# ``<item>`` tag to the last ``</item>`` tag of the searched text.
items_re = re.compile(r'<item>.+</item>', re.DOTALL)
def get_now():
    """Return today's local date as a zero-padded ``YYYY-MM-DD`` string.

    ``main`` uses the result as a lookup key into the history dict, whose
    keys are taken from ``datetime="..."`` attributes of scraped ``<time>``
    elements.  Those attribute values are zero-padded date strings, so the
    key built here must be zero-padded as well; joining ``str(month)`` and
    ``str(day)`` directly would yield e.g. ``2012-11-4`` and never match on
    single-digit months or days.
    """
    return datetime.now().strftime('%Y-%m-%d')
# Files used by main(): the list of show URLs (one per line), the JSON
# history of scraped episodes, and the generated RSS feed.
simpletv_config = 'simpletv.config'
all_history = 'all_history.json'
simpletv_xml = 'simpletv.xml'

# Show title: first isolate the <td id="overview-top"> cell, then take the
# text that follows the opening <h1 ...> tag inside it.
header_pattern1 = r'<td\s*id="overview-top">.+?</td>'
header_pattern2 = r'<h1.+?>(.+?)<'
header_re1 = re.compile(header_pattern1, re.S)
header_re2 = re.compile(header_pattern2, re.S)

# Episode details:
#   detail_pattern1 - inside a <div class="odd">, captures the <a> tag's
#                     attributes, its link text, and a following "(...)" group.
#   detail_pattern2 - pulls the href value out of those captured attributes.
#   detail_pattern3 - captures the datetime="..." value of the <time> element
#                     that follows an <h4> heading.
detail_pattern1 = r'''<div\s*class="odd">.*?<a([^>]+)>([^<]+?)</a>\s*(\(.+?\))'''
detail_pattern2 = r'href="(.+?)"'
detail_pattern3 = r'<h4[^>]*?>[^<]*?</h4>\s*<time.*?datetime="(.+?)"\s*>'
detail_re1 = re.compile(detail_pattern1, re.S)
detail_re2 = re.compile(detail_pattern2, re.S)
detail_re3 = re.compile(detail_pattern3, re.S)

# User-Agent header sent with every request.  The two literals are wrapped in
# parentheses so they are concatenated into one string; without a
# continuation the second literal is a separate statement and the header
# would be cut off after "rv:10.0.2) ".
agent = ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:10.0.2) '
         'Gecko/20100101 Firefox/10.0.2')

# Fixed head and tail of the generated feed; the single %s in the head is
# filled with the lastBuildDate value.
rss_header = '''<?xml version="1.0" encoding="iso-8859-1"?>
<rss version="2.0"><channel>
<title>simpletv</title><link>http://www.dabin.info</link>
<description>self tv</description>
<lastBuildDate>%s</lastBuildDate>'''
rss_foot = '''</channel></rss>'''
def _read_config_urls(path):
    """Return the stripped, non-blank lines of the file at *path*.

    An unreadable or missing file is reported and yields whatever lines were
    collected before the error (usually none).
    """
    urls = []
    try:
        with open(path, 'rb') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    urls.append(line)
    except (IOError, OSError) as exc:
        print('could not read %s: %s' % (path, exc))
    return urls


def _load_history(path):
    """Load the JSON history at *path*; return ``{}`` when unavailable."""
    try:
        with open(path, 'rb') as handle:
            history = json.load(handle)
    except (IOError, OSError):
        # No readable history (e.g. first run): start from scratch.
        return {}
    except ValueError as exc:
        print('ignoring unparsable %s: %s' % (path, exc))
        return {}
    # The rest of main() indexes the history as date -> show -> fields.
    return history if isinstance(history, dict) else {}


def _fetch(url):
    """Download *url* with the configured User-Agent and return the body."""
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', agent)]
    response = opener.open(url, timeout=15)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()


def _scrape_episode(url):
    """Scrape one show page; return ``(header, date, num, name, link)``.

    Any element that could not be extracted is ``None``.  ``header`` is the
    show title, ``name``/``num`` are the link text and the parenthesised
    group of the matched episode entry, ``link`` is that entry's absolute
    URL and ``date`` is the ``datetime`` attribute found on the linked page.
    """
    header = date = num = name = link = None
    page = _fetch(url)

    overview = header_re1.search(page)
    if overview is not None:
        title = header_re2.search(overview.group())
        if title is not None:
            header = title.group(1).strip()
            print(header)

    episode = detail_re1.search(page)
    print('o1: %s' % (episode is None))
    if episode is not None:
        anchor_attrs = episode.group(1)
        name = episode.group(2)
        num = episode.group(3)
        href = detail_re2.search(anchor_attrs)
        # Double space kept so the diagnostic output is unchanged.
        print('o2 is None  %s' % (href,))
        if href is not None:
            link = 'http://www.imdb.com' + href.group(1)
            aired = detail_re3.search(_fetch(link))
            print('_date: %s' % (aired is None))
            if aired is not None:
                date = aired.group(1).strip()
    return header, date, num, name, link


def _record_episode(history, header, date, num, name, link):
    """Store one scraped episode under ``history[date][header]``."""
    entry = history.setdefault(date, {}).setdefault(header, {})
    entry['num'] = num
    entry['name'] = name
    entry['link'] = link


def _new_items_for(history, day):
    """Return ``<item>`` XML for every not-yet-used episode dated *day*.

    Each episode turned into an item is flagged with ``used = '1'`` in
    *history* so it is not emitted again on a later run.
    """
    items = []
    if day not in history:
        return items
    print('found')
    for title, info in history[day].items():
        if info.get('used') is not None:
            continue
        info['used'] = '1'
        rss_item = PyRSS2Gen.RSSItem(
            # [2:-1] drops the first two characters and the last one of the
            # parenthesised group captured by detail_pattern1.
            title=title + ' ' + info['num'][2:-1],
            description=info['name'],
            link=info['link'],
            guid=PyRSS2Gen.Guid(str(uuid.uuid1()), 0),
            pubDate=datetime.now())
        item = items_re.search(rss_item.to_xml())
        if item is not None:
            items.append(item.group())
        else:
            print('error get find item')
    return items


def _read_existing_items(path):
    """Return the ``<item>...</item>`` span of the feed at *path*, or ''."""
    try:
        with open(path, 'rb') as handle:
            xml = handle.read()
    except (IOError, OSError):
        # No previous feed: nothing to carry over.
        return ''
    found = items_re.search(xml)
    return found.group() if found is not None else ''


def main():
    """Scrape the configured show pages and refresh the RSS feed.

    Steps:
      1. Read show URLs from ``simpletv_config`` and the episode history
         from ``all_history``.
      2. For every URL, scrape the episode data and record it in the history
         keyed by air date and show title.  A failure on one URL is reported
         and does not stop the others.
      3. Build RSS items for today's (``get_now()``) episodes that have not
         been emitted yet and, if there are any, rewrite ``simpletv_xml``
         with the new items placed before the previously published ones.
      4. Write the updated history back to ``all_history``.

    File and network problems are reported on stdout and otherwise treated
    as best-effort, so the function does not raise for them.
    """
    urls = _read_config_urls(simpletv_config)
    history = _load_history(all_history)

    for url in urls:
        try:
            fields = _scrape_episode(url)
            if any(value is None for value in fields):
                continue
            _record_episode(history, *fields)
        except Exception as exc:
            # Best-effort per URL, but let KeyboardInterrupt/SystemExit
            # through and say what went wrong instead of hiding it.
            print('failed to process %s: %s' % (url, exc))
            continue

    now_time = get_now()
    print('now_time is %s' % now_time)
    print(history)
    print(now_time)
    new_rss_items = _new_items_for(history, now_time)

    if new_rss_items:
        old_items = _read_existing_items(simpletv_xml)
        # NOTE(review): _format_date is a private PyRSS2Gen helper.
        feed_head = rss_header % PyRSS2Gen._format_date(datetime.now())
        try:
            with open(simpletv_xml, 'w+b') as handle:
                handle.write(
                    feed_head + ''.join(new_rss_items) + old_items + rss_foot)
        except (IOError, OSError) as exc:
            print('could not write %s: %s' % (simpletv_xml, exc))

    try:
        with open(all_history, 'w+b') as handle:
            json.dump(history, handle, indent=4)
    except (IOError, OSError, TypeError, ValueError) as exc:
        print('could not write %s: %s' % (all_history, exc))
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment