Last active
April 28, 2024 08:21
-
-
Save svmihar/576b72f468e517123b53191f7077083d to your computer and use it in GitHub Desktop.
scrape seluruh paragraf berita yang ada di indeks.kompas.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape the paragraphs of every article linked from a Kompas index page.

The script downloads one index page, collects the article links found on it,
downloads each article, and appends the text of every <p> element to
``paragraf.txt`` (one paragraph per line).
"""
from bs4 import BeautifulSoup
import requests

# Seconds to wait for a server before giving up; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 30

# Index page to scrape. Other pages follow the same /all/<date>/<page>
# pattern, e.g. 'https://indeks.kompas.com/all/2019-04-01/2'.
url = 'https://indeks.kompas.com/all/2019-04-01/3'

req = requests.get(url, timeout=REQUEST_TIMEOUT)
# Nothing useful can be done without the index page, so fail loudly here.
req.raise_for_status()
soup = BeautifulSoup(req.text, 'lxml')

# href=True skips anchors that carry the class but have no link target,
# which would otherwise raise KeyError on link['href'].
a = soup.find_all('a', {'class': 'article__link'}, href=True)
kumpulan_link = [link['href'] for link in a]

kumpulan_paragraf = []
for link in kumpulan_link:
    try:
        halaman = requests.get(link, timeout=REQUEST_TIMEOUT)
        halaman.raise_for_status()
    except requests.RequestException as err:
        # One unreachable article should not discard everything scraped so
        # far; report it and move on to the next link.
        print('gagal mengambil {}: {}'.format(link, err))
        continue
    soup_baru = BeautifulSoup(halaman.text, 'lxml')
    kumpulan_paragraf.extend(
        kalimat.text for kalimat in soup_baru.find_all('p')
    )

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding. Mode 'a' appends, so repeated runs accumulate in the same file.
with open('paragraf.txt', 'a', encoding='utf-8') as f:
    for isi in kumpulan_paragraf:
        f.write(isi + '\n')
        # Report success only after the paragraph has actually been written.
        print('penulisan berhasil')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thanks