Created
May 2, 2016 10:22
-
-
Save FGFW/20c92c2cf0f189a2b7c10f4952ef180c to your computer and use it in GitHub Desktop.
python采集55188.com论坛URL.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
python采集55188.com论坛URL.py  (collect thread URLs from the 55188.com forum)

http://bbs.bathome.net/thread-40289-1-1.html
2016-05-02 17:28:47 codegay

Reference: Python 3 study notes (using the urllib module)
http://www.cnblogs.com/Lands-ljk/p/5447127.html

Walks the forum listing pages, pulls the thread ids out of each page and
writes one thread URL per line to ``result.txt`` in the current directory.
"""
import re
from time import sleep
from urllib import request

rooturl = "http://www.55188.com/"
forumurl = "http://www.55188.com/forum-111-{}.html"
tidurl = "http://www.55188.com/viewthread.php?tid={}\n"

# Listing pages to crawl: 1 through 209 inclusive.
PAGE_NUMBERS = range(1, 210)

# Seconds to pause after each page request; 0 disables the pause.
REQUEST_DELAY = 0

# Compiled once instead of on every page.  A raw string keeps ``\?`` and
# ``\d`` as regex escapes without relying on Python passing unknown string
# escapes through unchanged.  ``.`` does not match a newline, so each match
# stays on a single line of the page and ends at the last ``#anchorlink`` on
# that line.
THREAD_LINK_RE = re.compile(r'''viewthread.php\?tid=(\d+).+#anchorlink''')


def extract_thread_ids(page_html):
    """Return the set of thread ids linked from one listing page.

    Args:
        page_html: Decoded HTML text of a forum listing page.

    Returns:
        A set of id strings (digits only); duplicates within the page
        collapse into one entry.  Empty when nothing matches.
    """
    return set(THREAD_LINK_RE.findall(page_html))


def fetch_page(url):
    """Download *url* and return its body decoded as GBK text.

    The response is closed as soon as the body has been read.  Bytes that
    are not valid GBK are replaced rather than raising, so one malformed
    page cannot abort the whole crawl; the thread ids are ASCII digits and
    are unaffected by the replacement.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    with request.urlopen(url) as response:
        raw = response.read()
    return raw.decode("gbk", errors="replace")


def main():
    """Crawl every listing page and write the thread URLs to result.txt."""
    print("程序运行中...")
    with open("result.txt", "w+") as out:
        for page in PAGE_NUMBERS:
            page_html = fetch_page(forumurl.format(page))
            # NOTE(review): de-duplication is per page only; a thread that
            # is listed on several pages is written once for each of them.
            for thread_id in extract_thread_ids(page_html):
                out.write(tidurl.format(thread_id))
            if REQUEST_DELAY:
                sleep(REQUEST_DELAY)
    input("运行结束,回车退出")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment