Created
August 29, 2021 08:03
-
-
Save behitek/51b64de7b551eca0e1f350f737d09bce to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import re | |
import sys | |
import time | |
import traceback | |
from datetime import datetime | |
import requests | |
def get_song_id(url, *, timeout=30, retry_delay=100, error_delay=1000):
    """Download a song page and return the song id embedded in it.

    The page is searched for a JavaScript assignment of the form
    ``var songid = "<digits>";`` and the digits are returned.

    Args:
        url: Address of the song page to download.
        timeout: Value passed to ``requests.get`` as its ``timeout``, so a
            stalled connection cannot block the crawl indefinitely.
        retry_delay: Seconds to sleep before retrying after a non-200 status.
        error_delay: Seconds to sleep before retrying after a request error.

    Returns:
        The song id as a string of digits, or ``None`` when ``url`` is empty
        or malformed, or when the page was fetched with status 200 but
        contains no song id.
    """
    # An empty URL can never succeed; retrying it would loop forever.
    if not url:
        return None

    song_id_pattern = re.compile(r'var songid = "(\d+)";')

    # Retry in a loop rather than by recursion: a long outage would otherwise
    # add one stack frame per attempt and eventually raise RecursionError.
    while True:
        try:
            r = requests.get(url, timeout=timeout)
        except (requests.exceptions.MissingSchema,
                requests.exceptions.InvalidSchema,
                requests.exceptions.InvalidURL):
            # A malformed URL is a permanent failure, so do not retry it.
            traceback.print_exc()
            return None
        except Exception:
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit through so the crawl can still be stopped.
            traceback.print_exc()
            time.sleep(error_delay)
            continue

        if r.status_code != 200:
            print('Got status code {}, retry ...'.format(r.status_code))
            time.sleep(retry_delay)
            continue

        found = song_id_pattern.search(r.text)
        return found.group(1) if found else None
def get_view_by_song_id(song_id, *, timeout=30, retry_delay=100, error_delay=1000):
    """Query the nhaccuatui counter API for the view count of one song.

    The response body is searched for ``"<song_id>":<digits>`` and the
    digits are returned.

    Args:
        song_id: Id of the song to look up. ``None`` or an empty string is
            answered with ``0`` without contacting the API.
        timeout: Value passed to ``requests.get`` as its ``timeout``, so a
            stalled connection cannot block the crawl indefinitely.
        retry_delay: Seconds to sleep before retrying after a non-200 status.
        error_delay: Seconds to sleep before retrying after a request error.

    Returns:
        The view count as a string of digits when the response contains one
        for ``song_id``; otherwise the integer ``0``. (The mixed return types
        are kept so existing output files stay consistent.)
    """
    # Without an id there is nothing to look up; skip the pointless request.
    if song_id is None or song_id == '':
        return 0

    counter_url = (
        'https://www.nhaccuatui.com/interaction/api/counter'
        '?jsoncallback=njf020493390263610209_1630061265092&listSongIds={}'
    ).format(song_id)
    # Capture the count directly instead of substituting over the whole body:
    # a failed substitution returns the body unchanged, which could be
    # mistaken for a count if it happened to start with a digit.
    count_pattern = re.compile(r'"{}":(\d+)'.format(re.escape(str(song_id))))

    # Retry in a loop rather than by recursion so a long outage cannot
    # exhaust the interpreter's recursion limit.
    while True:
        try:
            r = requests.get(counter_url, timeout=timeout)
        except Exception:
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit through so the crawl can still be stopped.
            traceback.print_exc()
            time.sleep(error_delay)
            continue

        if r.status_code != 200:
            print('Got status code {}, retry ...'.format(r.status_code))
            time.sleep(retry_delay)
            continue

        found = count_pattern.search(r.text)
        return found.group(1) if found else 0
def main():
    """Annotate each JSON line of the input file with song id and view count.

    Command line: ``INPUT_FILE [START_LINE]``.

    Every line of INPUT_FILE from START_LINE onward (1-based, default 1) is
    parsed as a JSON object with a ``url`` key. The object gains ``song_id``,
    ``view_count`` and ``crawled_time`` keys and is appended as one JSON line
    to ``INPUT_FILE.out``. Blank lines are skipped.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: {} INPUT_FILE [START_LINE]'.format(sys.argv[0]))

    input_path = sys.argv[1]
    output_path = input_path + '.out'
    start_line = int(sys.argv[2]) if len(sys.argv) > 2 else 1
    print('Start line', start_line)

    # Stays 0 for an empty file; otherwise ends as the number of the last
    # line read, which is the true total.
    line_number = 0
    # Open both files once. Explicit UTF-8 because ensure_ascii=False writes
    # non-ASCII characters verbatim.
    with open(input_path, encoding='utf-8') as src, \
            open(output_path, 'a', encoding='utf-8') as dst:
        for line_number, line in enumerate(src, start=1):
            if line_number < start_line or not line.strip():
                continue

            item = json.loads(line)
            song_id = get_song_id(item['url'])
            item['song_id'] = song_id
            item['view_count'] = get_view_by_song_id(song_id)
            item['crawled_time'] = datetime.today().strftime('%Y-%m-%d-%H:%M:%S')

            if line_number % 1000 == 0:
                print(item['crawled_time'], 'Line ', line_number)

            dst.write(json.dumps(item, ensure_ascii=False) + '\n')
            # Flush per record so an interrupted run can be resumed from
            # START_LINE without losing finished work.
            dst.flush()

    print('Total line', line_number)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment