Minshuu scraping http://www.nikki.ne.jp/
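The gist contains three snippets. In pipeline order they run roughly bottom-up: the shell loop (after the first script) downloads the BBS list pages as list_*.html, the second Python script extracts the linked BBS page URLs from those files into alllist.txt, and the first Python script then visits each BBS page and writes out the entries listed on it (apparently related-company links, given the "rellist" output names), reading its input from alllist_<suf>.txt, i.e. the URL list split into chunks.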
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 07 20:38:05 2014
@author: xiangze
"""
from BeautifulSoup import BeautifulSoup
import urllib2
import time
suf="1"
fname=open("alllist_"+suf+".txt")               # input: BBS page URLs, one per line
fw=open("minshu_rellist_"+suf+".txt","w")       # output: company name, related entry name
fwid=open("minshu_rellist_id_"+suf+".txt","w")  # output: bbs id, related entry id
flist=fname.readlines()
fname.close()
print len(flist)
#flist=[flist[0]]
for l in flist:
    f=urllib2.urlopen(l)
    c = f.headers.getparam('charset')
    try:
        s=unicode(f.read(), c)
        soup=BeautifulSoup(s)
        # company name from the page heading
        compname=soup.find("h1",{"class":"bbsHeadingText"}).find("a").text
        # names and hrefs of the entries listed as li.number
        relsname=[i.find("a").text for i in soup.findAll("li",{"class":"number"})]
        relsid=[i.find("a").get("href") for i in soup.findAll("li",{"class":"number"})]
        # reduce the page URL to its bbs id
        lid=l.replace("http://www.nikki.ne.jp/bbs/","").replace("/\n","")
        for r in relsname:
            print >>fw,compname+","+r
        for r in relsid:
            print >>fwid,lid+","+r.replace("/bbs/","").replace("/","")
        time.sleep(1)  # be polite to the server
    except:
        print "error "+l
# Shell script: download the BBS list pages as list_<bbs_id>_<initial>_<limit>.html
for i in 10 20 30 40 50; do
    for j in `seq 1 46`; do
        for l in 0 50 100 150 200 250 300; do
            q="http://www.nikki.ne.jp/?action=bbs&bbs_id=${i}&initial=${j}&limit=${l}"
            wget -O list_${i}_${j}_${l}.html $q
        done
    done
done
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 07 19:36:11 2014
@author: xiangze
"""
import glob
from BeautifulSoup import BeautifulSoup
ll=glob.glob("list_*_*_*.html")
#ll=glob.glob("list_30_1_0.html")
urltrunk="http://www.nikki.ne.jp"
fw=open("alllist.txt","w")
for fname in ll:
    soup=BeautifulSoup(open(fname))
    # absolute URLs of the BBS pages linked as li.normal
    links=[l.find("a").get("href") for l in soup.findAll("li",{"class":"normal"})]
    for i in links:
        print >>fw,urltrunk+i
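The scraper at the top reads alllist_<suf>.txt, while this script writes a single alllist.txt; the gist does not include the splitting step. A minimal sketch of one way to produce the chunked files (the helper and the chunk size of 500 are assumptions, not part of the original gist):

# Hypothetical helper: split alllist.txt into alllist_1.txt, alllist_2.txt, ...
# so each chunk can be scraped in a separate run.
CHUNK=500  # assumed chunk size, pick whatever suits the run
lines=open("alllist.txt").readlines()
for n in range(0,len(lines),CHUNK):
    out=open("alllist_%d.txt" % (n/CHUNK+1),"w")
    out.writelines(lines[n:n+CHUNK])
    out.close()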