scrape ads
'''
1. A counter keeps track of how many loop iterations have happened; that number can then be
used for various purposes.
For example, if a URL takes a page_number and we want pages 1, 2, 3, and so on, like
website.com/jobs?keywords=python&page_num=<counter value goes here: 1, 2, 3>
the result is
website.com/jobs?keywords=python&page_num=1
website.com/jobs?keywords=python&page_num=2
website.com/jobs?keywords=python&page_num=3
'''
page_number = 1
for post in response.css('a.job-item'):
linkads = f"https://jupiter.jora.com/api/v1/jobs?keywords=C%C3%B4ng%20ngh%E1%BB%87%20th%C3%B4ng%20tin&page_num={page_number}&session_id=1f4498b9c6f2ebda3cd5dcdf8ef6b15f&search_id=3yAkpixVHSHokFUnNESz-1f4498b9c6f2ebda3cd5dcdf8ef6b15f-X86gxLy3TuLx42PSU59a&session_type=web&user_id=3yAkpixVHSHokFUnNESz&logged_user=false&mobile=false&site_id=1&country=VN&host=https://jupiter.jora.com&full_text_only_search=true&ads_per_page=5&callback=_jsonp_0"
page_number += 1 # tambahkan page number 1 berarti page_number sekarang = 2, ini di gunakan di linkads page_num=1 page_num=2 dst
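'''
A minimal standalone sketch of the counter idea above, using the placeholder URL from the
explanation (website.com is not a real endpoint): each pass through the loop bumps
page_number, so the generated URLs end with page_num=1, page_num=2, page_num=3.
'''
page_number = 1
for _ in range(3):
    print(f"website.com/jobs?keywords=python&page_num={page_number}")
    page_number += 1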
'''
2. If the result is JSON, it sometimes has to be converted with the json library.
If we look at the output of https://jupiter.jora.com/api/v1/jobs?keywords=C%C3%B4ng%20ngh%E1%BB%87%20th%C3%B4ng%20tin&page_num=1&session_id=1f4498b9c6f2ebda3cd5dcdf8ef6b15f&search_id=3yAkpixVHSHokFUnNESz-1f4498b9c6f2ebda3cd5dcdf8ef6b15f-X86gxLy3TuLx42PSU59a&session_type=web&user_id=3yAkpixVHSHokFUnNESz&logged_user=false&mobile=false&site_id=1&country=VN&host=https://jupiter.jora.com&full_text_only_search=true&ads_per_page=5&callback=_jsonp_0
we see the body starts with /**/_jsonp_0( and ends with ), so it is not valid JSON and cannot be converted directly.
It has to be cleaned up first, like this:
'''
text_clean = response.text.replace("/**/_jsonp_0(", "")
text_clean = text_clean.replace(")", "")  # chain from text_clean, not response.text, or the first replace is lost
# now it can be converted to JSON
result_json = json.loads(text_clean)
print(result_json)
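'''
A slightly safer variant (a sketch, not part of the original gist): replacing every ")" in the
body can corrupt JSON values that themselves contain a closing parenthesis, so stripping only
the leading wrapper and the single trailing ")" is more robust. str.removeprefix and
str.removesuffix require Python 3.9+.
'''
text_clean = response.text.strip().removeprefix("/**/_jsonp_0(").removesuffix(")")
result_json = json.loads(text_clean)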
'''
3. The next step is to load the list of records, because the data list is nested under the 'ads' key.
'''
for data in result_json['ads']:
    name = data['name']
    url = data['url']
    yield scrapy.Request(url=url, callback=self.parse_item)
'''
Only then do we get the job URL, which we can use in
yield scrapy.Request(url=url, callback=self.parse_item)
'''
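'''
For reference, a sketch of the shape that loop assumes the cleaned response has (the field
values below are invented for illustration; only the 'ads', 'name', and 'url' keys come from
the code above):
'''
result_json = {
    "ads": [
        {"name": "Example Employer", "url": "https://www.jobstreet.vn/job/12345"},
        {"name": "Another Employer", "url": "https://www.jobstreet.vn/job/67890"},
    ]
}
for data in result_json['ads']:
    print(data['name'], data['url'])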
'''
Full script
'''
import scrapy
import json


class PostsSpider(scrapy.Spider):
    name = "posts"
    start_urls = [
        "https://www.jobstreet.vn/j?sp=search&q=C%C3%B4ng+ngh%E1%BB%87+th%C3%B4ng+tin&l"
    ]

    # {"title": response.css("#job-info-container > h3::text").get()
    def parse_item(self, response):
        item = {}
        company_name1 = response.css("#company-location-container > span.company::text").get()
        company_name2 = response.xpath("//*[@id='job-description-container']/div/div/p[17]/b/text()").get()
        company_name3 = response.css("#job-description-container > div > div > strong ::text").get()
        if company_name1:
            # no ads
            # top fields
            item["type"] = "no ads"
            item["jobtitle"] = response.css("h3.job-title.heading-xxlarge ::text").get()
            item["company_name"] = company_name1
            item["location"] = response.css("#company-location-container > span.location ::text").get()
            item["site"] = response.css("#job-meta > span.site ::text").get()
            # description
            item["desc"] = ''.join(response.css("#job-description-container ::text").getall())
        elif company_name2:  # company name at the bottom
            # no ads
            # top fields
            item["type"] = "no ads, company name at the bottom side"
            item["jobtitle"] = response.css("h3.job-title.heading-xxlarge ::text").get()
            item["company_name"] = response.xpath("//*[@id='job-description-container']/div/div/p[17]/b/text()").get()
            item["location"] = response.css("div #company-location-container > span.location ::text").get()
            item["site"] = response.css("div #job-meta > span.site ::text").get()
            # description
            item["desc"] = ''.join(response.css("#job-description-container ::text").getall())
        else:  # no description
            item["type"] = "no ads, no desc"
            item["jobtitle"] = response.css("h3.job-title.heading-xxlarge ::text").get()
            item["company_name"] = company_name3
            item["location"] = response.css("#company-location-container > span.location ::text").get()
            item["site"] = response.css("#job-meta > span.site ::text").get()
            item["desc"] = "no desc"
        return item

    def parse_item_json(self, response):
        # strip the JSONP wrapper so the body can be parsed as plain JSON
        text_clean = response.text.replace("/**/_jsonp_0(", "")
        text_clean = text_clean.replace(")", "")
        result_json = json.loads(text_clean)
        for data in result_json['ads']:
            name = data['name']
            url = data['url']
            yield scrapy.Request(url=url, callback=self.parse_item)

    def parse(self, response):
        page_number = 1
        for post in response.css('a.job-item'):
            #linkads = []
            #page = 1
            #linkpart1 = "https://jupiter.jora.com/api/v1/jobs?keywords=C%C3%B4ng%20ngh%E1%BB%87%20th%C3%B4ng%20tin&page_num="
            #linkpart2 = "&session_id=1f4498b9c6f2ebda3cd5dcdf8ef6b15f&search_id=3yAkpixVHSHokFUnNESz-1f4498b9c6f2ebda3cd5dcdf8ef6b15f-X86gxLy3TuLx42PSU59a&session_type=web&user_id=3yAkpixVHSHokFUnNESz&logged_user=false&mobile=false&site_id=1&country=VN&host=https://jupiter.jora.com&full_text_only_search=true&ads_per_page=5&callback=_jsonp_0"
            #while page < 51:
            #    print(linkpart1 + str(page) + linkpart2)
            #    page += 1
            #    linkads.append(page)
            data = {
                # total = 15, ads = 5, non ads = 10
                # non ads
                #"jobtitle": post.css(".job-item ::attr(title)").get(),
                "url": post.css(".job-item ::attr(href)").get()
            }
            linkads = f"https://jupiter.jora.com/api/v1/jobs?keywords=C%C3%B4ng%20ngh%E1%BB%87%20th%C3%B4ng%20tin&page_num={page_number}&session_id=1f4498b9c6f2ebda3cd5dcdf8ef6b15f&search_id=3yAkpixVHSHokFUnNESz-1f4498b9c6f2ebda3cd5dcdf8ef6b15f-X86gxLy3TuLx42PSU59a&session_type=web&user_id=3yAkpixVHSHokFUnNESz&logged_user=false&mobile=false&site_id=1&country=VN&host=https://jupiter.jora.com&full_text_only_search=true&ads_per_page=5&callback=_jsonp_0"
            link = "https://www.jobstreet.vn/" + data.get("url")
            page_number += 1
            if link is not None:
                yield scrapy.Request(url=link, callback=self.parse_item)
                yield scrapy.Request(url=linkads, callback=self.parse_item_json)
            #for a in linkads:
            #    print(a)
            #    #scrapy.Request(url=a, callback=self.parse_item_json)
            # next_page = response.css("a.next-page-button::attr(href)").get()
            # if next_page is not None:
            #     next_page = response.urljoin(next_page)
            #     yield scrapy.Request(next_page, callback=self.parse)
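'''
A minimal sketch of one way to run the spider from a plain Python script and write the scraped
items to a JSON file (assumes Scrapy >= 2.1 for the FEEDS setting; running `scrapy crawl posts`
inside a Scrapy project works just as well):
'''
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "FEEDS": {"jobs.json": {"format": "json"}},  # export yielded items to jobs.json
})
process.crawl(PostsSpider)
process.start()  # blocks until the crawl finishes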