@laymonage
Last active July 18, 2018 08:42
import os
import requests
from bs4 import BeautifulSoup


def scrape(link):
    '''
    Recursive function that scrapes the punkindonesiamp3 blog.
    '''
    result = ''
    req = requests.get(link)
    sup = BeautifulSoup(req.text, 'html.parser')
    semua_post = sup.find_all('div', {'class': 'post hentry'})
    for post in semua_post:
        nama = post.find('h3').text.strip()  # Post title
        labels = post.find_all('a', {'rel': 'tag'})  # Post labels
        kota = ''
        genre = []
        related = []
        for label in labels:
            label = label.text.strip()
            if '*' in label:
                # Strip the '*' marker and the word 'city',
                # then capitalize the city name
                kota = label[1:].replace(' city', '').title()
            elif '#' in label:
                # Strip the '#' marker and add the genre to the list
                genre.append(label[1:])
            elif label.lower() != nama.lower():
                # Keep the remaining labels as related acts
                related.append(label)
        result += ("{},{},{},{}\n"
                   .format(nama, kota, ';'.join(genre), ';'.join(related)))
    older_posts = sup.find('a', 'blog-pager-older-link')
    if older_posts:
        # Recurse if there is an "older posts" link
        return result + scrape(older_posts.get('href'))
    return result


if __name__ == '__main__':
    # Write the results to a CSV file in the current directory
    with open(os.path.join(os.getcwd(), 'punk_hasil.csv'), 'w') as output:
        # CSV header (in Indonesian): band name, origin, genre, related acts
        output.write("Nama grup,Asal,Genre,Terkait\n")
        output.write(scrape('http://punkindonesiamp3.blogspot.com'))
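
One caveat with the hand-rolled output above: a band name or label that itself contains a comma would break the CSV columns. Below is a minimal sketch of a safer output step using Python's csv module; the write_rows helper is hypothetical and assumes scrape() were changed to return (nama, kota, genre, related) tuples instead of a pre-formatted string.

import csv


def write_rows(rows, path='punk_hasil.csv'):
    # Hypothetical helper: 'rows' is assumed to be an iterable of
    # (nama, kota, genre, related) tuples collected by scrape().
    # csv.writer quotes any field that contains a comma or a quote.
    with open(path, 'w', newline='') as output:
        writer = csv.writer(output)
        writer.writerow(['Nama grup', 'Asal', 'Genre', 'Terkait'])
        writer.writerows(rows)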