Created
July 10, 2016 16:26
-
-
Save hahn/9ff5dcdba230ad95095e5e4a29c3df11 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time
import urllib
from contextlib import closing
from datetime import date

from bs4 import BeautifulSoup
# Address of the news index page that the scraper starts from.
url = "http://indeks.kompas.com/indeks/index/news"

# Today's date formatted as DDMMYYYY; it is embedded in both output file names.
today = date.today().strftime("%d%m%Y")

# File that receives the article text.
filename = "brt-%s.txt" % today
# File that receives the article links, one per line.
filelink = "link-%s.txt" % today
def getUrl(url):
    """Collect the article links found on one index page.

    Downloads ``url`` with ``urllib.urlopen``, parses the response with
    BeautifulSoup's ``html.parser`` and gathers the ``href`` of every
    ``<a>`` element nested inside an ``<h3>`` element.

    Args:
        url: Address of the index page to fetch.

    Returns:
        A list of ``href`` values in document order. The list is empty
        when the page contains no matching links.
    """
    # closing() releases the connection even if read() raises.
    with closing(urllib.urlopen(url)) as response:
        html = response.read()

    soup = BeautifulSoup(html, "html.parser")
    links = []
    for heading in soup.find_all('h3'):
        for anchor in heading.find_all('a'):
            href = anchor.get('href')
            # .get() returns None for anchors without an href; skip those
            # so that the result only ever contains usable link strings.
            if href:
                links.append(href)
    return links
def getIndeks(url):
    """Walk the paginated index and save every article link to ``filelink``.

    The base ``url`` is fetched once to decide whether paging should start
    at all; its links are not stored. Pages are then requested as
    ``<url>?p=1``, ``<url>?p=2``, ... and their links accumulated until a
    page yields no links. Each requested page address and the final link
    count are printed as progress output.

    Args:
        url: Address of the index page; page numbers are appended to it
            as a ``?p=<n>`` query string.

    Returns:
        None. The links are written to ``filelink``, one per line,
        replacing any previous content of that file.
    """
    page = 1
    collected = []
    found = getUrl(url)
    while found:
        # Build the page address from the argument instead of repeating
        # the index address as a literal.
        page_url = "%s?p=%d" % (url, page)
        print(page_url)
        page += 1
        found = getUrl(page_url)
        collected += found
    print(len(collected))

    # The with-block closes the file even if a write fails.
    with open(filelink, "w") as link_file:
        for link in collected:
            link_file.write(link + "\n")
def getBerita(url):
    """Download one article and append its text to ``filename``.

    The page is fetched with ``urllib.urlopen`` and parsed with
    BeautifulSoup's ``html.parser``. The text of every ``<div>`` whose
    class is ``kcm-read-text`` is UTF-8 encoded, concatenated and appended
    to ``filename`` followed by a newline.

    Args:
        url: Address of the article page. Surrounding whitespace (such as
            the newline left on a line read from a file) is removed
            before the request is made.

    Returns:
        None. A page without matching ``<div>`` elements still appends a
        single newline to ``filename``.
    """
    # closing() releases the connection even if read() raises.
    with closing(urllib.urlopen(url.strip())) as response:
        html = response.read()

    soup = BeautifulSoup(html, "html.parser")
    sections = soup.find_all('div', {'class': 'kcm-read-text'})
    # Join once instead of growing a string with += inside a loop.
    article = "".join(
        section.get_text().encode("utf-8") for section in sections
    )

    # The with-block closes the output file even if a write fails.
    with open(filename, 'a') as out:
        out.write(article)
        out.write("\n")
def getLinkFromFile():
    """Fetch every article whose link was saved in ``filelink``.

    Reads ``filelink`` line by line, prints each link as progress output
    and passes it to ``getBerita``. Blank lines are skipped.

    Returns:
        None.
    """
    print("Ambil berita dari link tanggal %s. " % today)
    # The with-block closes the link file even if a download fails.
    with open(filelink, 'r') as link_file:
        for line in link_file:
            # Drop the line terminator so the link is printed and
            # requested without a trailing newline.
            link = line.strip()
            if not link:
                continue
            print(link)
            getBerita(link)
# Run the scraper: first save today's article links to the link file,
# then download the article behind each saved link.
getIndeks(url)
getLinkFromFile()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment