scrape ads
'''
1. A counter is used to count how many loop iterations have occurred; that number can then be
reused for various purposes.
For example, given a URL that takes a page_number and we want to fetch pages 1, 2, 3, and so on:
website.com/jobs?keywords=python&page_num=<counter value goes here: 1, 2, 3>
which produces
website.com/jobs?keywords=python&page_num=1
website.com/jobs?keywords=python&page_num=2
website.com/jobs?keywords=python&page_num=3
'''
page_number = 1
for post in response.css('a.job-item'):
    linkads = f"https://jupiter.jora.com/api/v1/jobs?keywords=C%C3%B4ng%20ngh%E1%BB%87%20th%C3%B4ng%20tin&page_num={page_number}&session_id=1f4498b9c6f2ebda3cd5dcdf8ef6b15f&search_id=3yAkpixVHSHokFUnNESz-1f4498b9c6f2ebda3cd5dcdf8ef6b15f-X86gxLy3TuLx42PSU59a&session_type=web&user_id=3yAkpixVHSHokFUnNESz&logged_user=false&mobile=false&site_id=1&country=VN&host=https://jupiter.jora.com&full_text_only_search=true&ads_per_page=5&callback=_jsonp_0"
    page_number += 1  # increment by 1, so page_number is now 2; this drives page_num=1, page_num=2, etc. in linkads
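'''
A minimal, standalone sketch of the same counter idea (website.com/jobs is the placeholder URL
from the example above, not a real endpoint): each pass through the loop drops the current
counter value into the query string and then increments it.
'''
page_num = 1
for _ in range(3):
    print(f"website.com/jobs?keywords=python&page_num={page_num}")
    page_num += 1  # 1 -> 2 -> 3, so the printed URLs end with page_num=1, page_num=2, page_num=3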
'''
2. If the response carries JSON, it sometimes has to be converted with the json library first.
If we look at the output of https://jupiter.jora.com/api/v1/jobs?keywords=C%C3%B4ng%20ngh%E1%BB%87%20th%C3%B4ng%20tin&page_num=1&session_id=1f4498b9c6f2ebda3cd5dcdf8ef6b15f&search_id=3yAkpixVHSHokFUnNESz-1f4498b9c6f2ebda3cd5dcdf8ef6b15f-X86gxLy3TuLx42PSU59a&session_type=web&user_id=3yAkpixVHSHokFUnNESz&logged_user=false&mobile=false&site_id=1&country=VN&host=https://jupiter.jora.com&full_text_only_search=true&ads_per_page=5&callback=_jsonp_0
we see that it starts with /**/_jsonp_0( and ends with ), so it is not valid JSON and cannot be
parsed directly. It needs to be cleaned up first:
'''
text_clean = response.text.replace("/**/_jsonp_0(", "")
text_clean = text_clean.replace(")", "")
# now it can be converted to JSON
result_json = json.loads(text_clean)
print(result_json)
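'''
A safer variant of the cleanup (a sketch, assuming the payload always has the form
<callback>(<json>), as implied by the callback=_jsonp_0 parameter above): slice between the
first "(" and the last ")" instead of deleting every ")", which would also corrupt any
parentheses that appear inside the JSON strings themselves.
'''
raw = response.text
start = raw.find("(") + 1   # index just past the opening parenthesis of the JSONP wrapper
end = raw.rfind(")")        # index of the final closing parenthesis
result_json = json.loads(raw[start:end])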
'''
3. The next step is to iterate over the JSON data, since the list of results lives under the 'ads' key.
'''
for data in result_json['ads']:
    name = data['name']
    url = data['url']
    yield scrapy.Request(url=url, callback=self.parse_item)
'''
From that we get each ad's url, which we can then request with
yield scrapy.Request(url=url, callback=self.parse_item)
'''
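'''
For reference, a hypothetical shape of the cleaned API response, inferred only from the two
fields the loop above reads (the real payload almost certainly carries more keys per ad):
'''
example_result_json = {
    "ads": [
        {"name": "Example Job 1", "url": "https://www.example.com/job/1"},
        {"name": "Example Job 2", "url": "https://www.example.com/job/2"},
    ]
}
for data in example_result_json['ads']:
    print(data['name'], data['url'])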
'''
Complete script
'''
import scrapy
import json


class PostsSpider(scrapy.Spider):
    name = "posts"
    start_urls = [
        "https://www.jobstreet.vn/j?sp=search&q=C%C3%B4ng+ngh%E1%BB%87+th%C3%B4ng+tin&l"
    ]
#{"title" : response.css("#job-info-container > h3::text").get() | |
def parse_item(self, response): | |
item = {} | |
company_name1 = response.css("#company-location-container > span.company::text").get() | |
company_name2 = response.xpath("//*[@id='job-description-container']/div/div/p[17]/b/text()").get() | |
company_name3 = response.css("#job-description-container > div > div > strong ::text").get() | |
if company_name1: | |
#no ads | |
#top | |
item["type"] = "no ads", | |
item["jobtitle"] = response.css("h3.job-title.heading-xxlarge ::text").get(), | |
item["company_name"] = company_name1, | |
item["location"] = response.css("#company-location-container > span.location ::text").get(), | |
item["site"] = response.css("#job-meta > span.site ::text").get(), | |
#desc | |
item["desc"] = ''.join(response.css("#job-description-container ::text").getall()), | |
elif company_name2:#company di bawah | |
#no ads | |
#top | |
item["type"] = "no ads, company name at the bottom side", | |
item["jobtitle"] = response.css("h3.job-title.heading-xxlarge ::text").get(), | |
item["company_name"] = response.xpath("//*[@id='job-description-container']/div/div/p[17]/b/text()").get(), | |
item["location"] = response.css("div #company-location-container > span.location ::text").get(), | |
item["site"] = response.css("div #job-meta > span.site ::text").get(), | |
#desc | |
item["desc"] = ''.join(response.css("#job-description-container ::text").getall()) | |
else: #no description | |
item["type"] = "no ads, no desc", | |
item["jobtitle"] = response.css("h3.job-title.heading-xxlarge ::text").get(), | |
item["company_name"] = company_name3 | |
item["location"] = response.css("#company-location-container > span.location ::text").get(), | |
item["site"] = response.css("#job-meta > span.site ::text").get(), | |
item["desc"] = "no desc" | |
return item | |
    def parse_item_json(self, response):
        # strip the JSONP wrapper so the body becomes valid JSON
        text_clean = response.text.replace("/**/_jsonp_0(", "")
        text_clean = text_clean.replace(")", "")
        result_json = json.loads(text_clean)
        for data in result_json['ads']:
            name = data['name']
            url = data['url']
            yield scrapy.Request(url=url, callback=self.parse_item)
    def parse(self, response):
        page_number = 1
        for post in response.css('a.job-item'):
            #linkads=[]
            #page=1
            #linkpart1="https://jupiter.jora.com/api/v1/jobs?keywords=C%C3%B4ng%20ngh%E1%BB%87%20th%C3%B4ng%20tin&page_num="
            #linkpart2="&session_id=1f4498b9c6f2ebda3cd5dcdf8ef6b15f&search_id=3yAkpixVHSHokFUnNESz-1f4498b9c6f2ebda3cd5dcdf8ef6b15f-X86gxLy3TuLx42PSU59a&session_type=web&user_id=3yAkpixVHSHokFUnNESz&logged_user=false&mobile=false&site_id=1&country=VN&host=https://jupiter.jora.com&full_text_only_search=true&ads_per_page=5&callback=_jsonp_0"
            #while page<51:
            #    print(linkpart1+str(page)+linkpart2)
            #    page+=1
            #    linkads.append(page)
            data = {
                #total = 15, ads = 5, non ads = 10
                #non ads
                #"jobtitle" : post.css(".job-item ::attr(title)").get(),
                "url": post.css(".job-item ::attr(href)").get()
            }
            linkads = f"https://jupiter.jora.com/api/v1/jobs?keywords=C%C3%B4ng%20ngh%E1%BB%87%20th%C3%B4ng%20tin&page_num={page_number}&session_id=1f4498b9c6f2ebda3cd5dcdf8ef6b15f&search_id=3yAkpixVHSHokFUnNESz-1f4498b9c6f2ebda3cd5dcdf8ef6b15f-X86gxLy3TuLx42PSU59a&session_type=web&user_id=3yAkpixVHSHokFUnNESz&logged_user=false&mobile=false&site_id=1&country=VN&host=https://jupiter.jora.com&full_text_only_search=true&ads_per_page=5&callback=_jsonp_0"
            link = "https://www.jobstreet.vn/" + data.get("url")
            page_number += 1
            if link is not None:
                yield scrapy.Request(url=link, callback=self.parse_item)
                yield scrapy.Request(url=linkads, callback=self.parse_item_json)
            #for a in linkads:
            #    print(a)
            #    #scrapy.Request(url = a, callback = self.parse_item_json)
            #next_page = response.css("a.next-page-button::attr(href)").get()
            #if next_page is not None:
            #    next_page = response.urljoin(next_page)
            #    yield scrapy.Request(next_page, callback=self.parse)
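'''
To try the spider standalone (a sketch; assumes Scrapy is installed and the file is saved as,
say, posts_spider.py), it can be run outside a full Scrapy project with:

    scrapy runspider posts_spider.py -o jobs.json

which writes every yielded item to jobs.json. Note that the session_id / search_id values baked
into linkads were captured from a live browsing session and may expire, in which case the
jupiter.jora.com API requests can fail until they are refreshed.
'''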