Skip to content

Instantly share code, notes, and snippets.

@behitek
Created August 29, 2021 08:03
Show Gist options
  • Save behitek/51b64de7b551eca0e1f350f737d09bce to your computer and use it in GitHub Desktop.
Save behitek/51b64de7b551eca0e1f350f737d09bce to your computer and use it in GitHub Desktop.
import json
import os
import re
import sys
import time
import traceback
from datetime import datetime
import requests
def get_song_id(url, retry_delay=100, error_delay=1000, timeout=30):
    """Download a song page and return the song id embedded in it.

    The id is read from a JavaScript assignment of the form
    ``var songid = "<digits>";`` in the page body.  A non-200 response or a
    failed request is retried indefinitely after a pause, so this call only
    returns once a page has been fetched with HTTP 200.

    Args:
        url: Address of the song page to download.
        retry_delay: Seconds to wait before retrying after a non-200 status.
        error_delay: Seconds to wait before retrying after the request
            itself raised an exception.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the crawl forever.

    Returns:
        The song id as a string of digits, or ``None`` when the page was
        fetched successfully but contains no ``songid`` assignment.
    """
    # Retry with a loop instead of recursion: a long outage would otherwise
    # grow the call stack until RecursionError.
    while True:
        try:
            r = requests.get(url, timeout=timeout)
        except Exception:
            # Best-effort crawler: log and retry on any request failure.
            # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit
            # stop the script instead of being swallowed.
            traceback.print_exc()
            time.sleep(error_delay)
            continue
        if r.status_code != 200:
            print('Got status code {}, retry ...'.format(r.status_code))
            time.sleep(retry_delay)
            continue
        found = re.search(r'var songid = "(\d+)";', r.text)
        return found.group(1) if found else None
def get_view_by_song_id(song_id, retry_delay=100, error_delay=1000, timeout=30):
    """Query the counter endpoint and return the view count for one song.

    The response body is searched for ``"<song_id>":<digits>``.  A non-200
    response or a failed request is retried indefinitely after a pause.

    Args:
        song_id: Song id to look up (as produced by ``get_song_id``).
            ``None`` means the id could not be determined; no request is
            made in that case.
        retry_delay: Seconds to wait before retrying after a non-200 status.
        error_delay: Seconds to wait before retrying after the request
            itself raised an exception.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the crawl forever.

    Returns:
        The view count as a string of digits when it is present in the
        response, otherwise the integer ``0``.  (The mixed str/int result is
        kept as-is for compatibility with existing output files.)
    """
    if song_id is None:
        # Nothing to look up; avoid a pointless request for "None".
        return 0
    # Escape the id so it is always matched literally inside the pattern.
    count_pattern = r'"{}":(\d+)'.format(re.escape(str(song_id)))
    # Retry with a loop instead of recursion: a long outage would otherwise
    # grow the call stack until RecursionError.
    while True:
        try:
            r = requests.get(
                'https://www.nhaccuatui.com/interaction/api/counter?jsoncallback=njf020493390263610209_1630061265092&listSongIds={}'.format(
                    song_id),
                timeout=timeout)
        except Exception:
            # Best-effort crawler: log and retry on any request failure.
            # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit
            # stop the script instead of being swallowed.
            traceback.print_exc()
            time.sleep(error_delay)
            continue
        if r.status_code != 200:
            print('Got status code {}, retry ...'.format(r.status_code))
            time.sleep(retry_delay)
            continue
        found = re.search(count_pattern, r.text)
        return found.group(1) if found else 0
def main(argv=None):
    """Annotate every record of a JSON-lines file with song id and view count.

    Each non-blank input line must be a JSON object with a ``url`` key.  The
    record is extended with ``song_id``, ``view_count`` and ``crawled_time``
    and appended to ``<input path>.out``.

    Args:
        argv: ``[INPUT_FILE, START_LINE]``; defaults to ``sys.argv[1:]``.
            ``START_LINE`` is the 1-based line to resume from and may be
            omitted, in which case processing starts at line 1.

    Returns:
        Process exit status: ``0`` on success, ``2`` when no input file
        was given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print('Usage: {} INPUT_FILE [START_LINE]'.format(sys.argv[0]), file=sys.stderr)
        return 2

    in_path = args[0]
    out_path = in_path + '.out'
    start_line = int(args[1]) if len(args) > 1 else 1
    print('Start line', start_line)

    total_lines = 0
    # Explicit UTF-8: records are written with ensure_ascii=False, so the
    # platform default encoding may be unable to represent them.
    with open(in_path, encoding='utf-8') as src, \
            open(out_path, 'a', encoding='utf-8') as dst:
        for line_number, line in enumerate(src, start=1):
            total_lines = line_number
            if line_number < start_line:
                continue
            if not line.strip():
                # A blank line is not a record; skip it rather than crash.
                continue
            item = json.loads(line)
            song_id = get_song_id(item['url'])
            item['song_id'] = song_id
            item['view_count'] = get_view_by_song_id(song_id)
            item['crawled_time'] = datetime.today().strftime('%Y-%m-%d-%H:%M:%S')
            if line_number % 1000 == 0:
                print(item['crawled_time'], 'Line ', line_number)
            dst.write(json.dumps(item, ensure_ascii=False) + '\n')
            # Flush per record so an interrupted run loses nothing and can
            # be resumed with START_LINE.
            dst.flush()
    print('Total line', total_lines)
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment