Skip to content

Instantly share code, notes, and snippets.

@dauuricus
Created November 14, 2022 13:29
Show Gist options
  • Save dauuricus/c76444d1fabbdd0273ac79e428422d1f to your computer and use it in GitHub Desktop.
YouTube clip search
# coding: UTF-8
import urllib.request
import urllib.parse
import re
import time
import json
from bs4 import BeautifulSoup
words = "ransomware"
keywords = urllib.parse.quote(words)
target_url = "https://www.youtube.com/results?search_query=" + keywords
html = urllib.request.urlopen(target_url).read()
html_strings = html.decode()
del(html)
soup = BeautifulSoup(html_strings, 'html.parser')
for ind,script_tag in enumerate(soup.find_all('script')):
if ind == 33:
json_strings = str(script_tag)[58:-10]
del(soup)
j_dict = json.loads(json_strings)
#print(json.dumps(j_dict, indent=2))
jj_dict = {}
for k,v in j_dict.items():
if k == "contents":
for kk,vv in v.items():
for kkk,vvv in vv.items():
if kkk == 'primaryContents':
for kkkk,vvvv in vvv.items():
for kkkkk,vvvvv in vvvv.items():
if kkkkk == 'contents':
jj_dict = vvvvv[0]
#print(vvvvv[1])
videoid_list = []
title_list = []
durations_list = []
channelid_list = []
for k,v in jj_dict.items():
for kk,vv in v.items():
if kk == 'contents':
for ind,vvv in enumerate(vv):
if(ind == len(vv)-1):
print(ind)
print()
else:
print(ind,end=", ")
match1 = re.search('videoId',str(vvv))
video_id = ""
if match1 != None:
startindex = match1.start(0)
endindex = match1.end(0)
video_id = str(vvv)[(startindex + 11):(endindex + 15)]
videoid_list.append(video_id)
else:
videoid_list.append(video_id)
match2 = re.search('\'title\':\s\{\'runs\':\s\[\{\'text\':\s[\'|\"](.*?)[\'|\"]\}\],',str(vvv))
title = ""
if match2 != None:
#startindex = match2.start(0)
#endindex = match2.end(0)
title = match2.group(1) #str(vvv)[startindex:endindex]
title_list.append(title)
else:
title_list.append(title)
match3 = re.search('\'lengthText\':\s\{\'accessibility\':\s\{\'accessibilityData\':\s\{\'label\':\s\'.*?\'\}\},\s\'simpleText\':\s\'(\d+?:\d+?|\d+?:\d+?:\d+?)\'\},',str(vvv))
durations = ""
if match3 != None:
durations = match3.group(1)
durations_list.append(durations)
else:
durations_list.append(durations)
del(match3,match2,match1)
# match4 = re.search('\{[\'|\"]url[\'|\"]:\s[\'|\"]\/channel\/(.+?)[\'|\"],',str(vvv))
# channel_id = ""
# if match4 != None:
# channel_id = match4.group(1)
# channelid_list.append(channel_id)
# print(len(videoid_list),video_id)
# print(len(title_list),title)
# print(len(durations_list),durations)
## print(len(channelid_list),channel_id)
ziped = zip(videoid_list,title_list,durations_list)
#for ind,x in enumerate(ziped):
# print (ind,x)
# print ()
# check values
search_list_temporary = [x for x in ziped if x[0] != '' and x[1] != '' and x[2] != '' ]
del(ziped)
######## ##### ###### ###### ###### ###### ###### ######
#
#for ind,p in enumerate(search_list_temporary):
# print(ind, p)
#print()
#
######## ##### ###### ###### ###### ###### ###### ######
# check keywords in video title
search_list = [ x for x in search_list_temporary if re.search(rf'(?i){words}',x[1]) ]
del(search_list_temporary)
#for ind,p in enumerate(search_list):
# print(ind, p)
#print()
#
######## ##### ###### ###### ###### ###### ###### ######
video_list_0 = []
already_list = []
total_list = []
id_cell_1 = {}
id_cell_2 = {}
video_list_0.extend(search_list)
def first_gether(words, target_url, total_list, id_cell_1, id_cell_2):
id_cell_seconds = {}
id_cell_title = {}
ommit = target_url.replace("https://www.youtube.com/watch?v=",'')
html = urllib.request.urlopen(target_url).read()
html_strings = html.decode()
del(html)
soup = BeautifulSoup(html_strings, 'html.parser')
for ind,script_tag in enumerate(soup.find_all('script')):
if ind == 40:
json_strings = str(script_tag)[58:-10]
del(soup)
j_dict = json.loads(json_strings)
videoid_list = []
title_list = []
durations_list = []
channelid_list = []
for k,v in j_dict.items():
# if k == 'currentVideoEndpoint':
# continue
# print('currentVideoEndpoint:',v)
# if k == 'trackingParams':
# continue
# print('trackingParams:',v)
# if k == 'onResponseReceivedEndpoints':
# continue
# print('onResponseReceivedEndpoints:',v)
# if k == 'engagementPanels':
# continue
# print('engagementPanels:',v)
# if k == 'topbar':
# continue
# print('topbar:',v)
if k == 'contents':
for kk,vv in v.items():
for kkk,vvv in vv.items():
if kkk == 'results':
continue
if kkk == 'secondaryResults':
match1 = re.findall('\'videoId\':\s\'(.+?)\',',str(vvv))
video_id = ""
if match1 != None:
videoid_list.extend(sorted(list(set(match1)),key=match1.index))
videoid_list.remove(ommit)
else:
videoid_list.append(video_id)
match2 = re.findall('\'title\':\s\{\'accessibility\':\s\{\'accessibilityData\':\s\{\'label\':\s[\'|\"](.*?)[\'|\"]\}\},', str(vvv))
title = ""
if match2 != None:
title_list.extend(list(match2))
else:
title_list.append(title)
del(match2,match1)
# print(len(videoid_list))
# for i,v in enumerate(videoid_list):
# print(i,v)
# print()
# print(len(title_list))
# for i,v in enumerate(title_list):
# print(i,v)
# print()
ziped2 = zip(videoid_list,title_list)
ziped2 = [ x for x in ziped2 if re.search(rf'(?i){words}',x[1])]
# for i,v in enumerate(ziped2):
# print(i,v)
video_list = []
id_data = [x[0] for x in ziped2]
video_list.extend(id_data)
### video_ids #################################
# video_ids = re.findall(r"watch\?v=(\S{11})", html_strings)
#
# omitlist = []
# if video_ids is not None:
# offset = 0
# for ind,idcode in enumerate(video_ids):
# mmmm = re.search(rf'{idcode}',html_strings[offset:-1])
# endindex = mmmm.end(0)
# maybetitle = html_strings[endindex:(endindex + 1400)]
# #print(ind,idcode)
# #print(maybetitle)
# kakawari = re.search(rf'(?i){words}',maybetitle)
# if kakawari is None:
# omitlist.append(video_ids[ind])
#
# if len(omitlist) > 0:
# for omit in omitlist:
# video_ids = [ x for x in video_ids if x != omit ]
#
# del(omitlist)
# id_data = list(set(video_ids))
# del(video_ids)
#
# video_list.extend(id_data)
# video_list = ["https://www.youtube.com/watch\?v=" + str(x) for x in id_data]
# del(id_data)
#################################
title = re.search(r"(?<=\<title\>).*?(?=\<\/title\>)",html_strings)
if title is None:
print("skip")
else:
title_strings = title.group()
kakawari = re.search(rf'(?i){words}',title_strings)
if kakawari is None:
print()
print(title_strings)
print("skip (not relevant)")
print()
pass
else:
total_list.append(target_url)
idxxx = target_url.replace("https://www.youtube.com/watch\?v=","")
id_cell_title[idxxx] = title_strings
print()
print('!!!',title_strings)
print('@@@',target_url)
keyword2 = re.search(r'"channelId" content=',html_strings)
if keyword2 is None:
pass
else:
channelid = keyword2.end(0)
print("channelId",html_strings[channelid:(channelid + 26)])
id_cell_1[idxxx] = html_strings[channelid:(channelid + 26)]
keyword3 = re.search(r'"datePublished" content=',html_strings)
if keyword3 is None:
pass
else:
published = keyword3.end(0)
print("datePublished",html_strings[published:(published + 12)])
keyword4 = re.search(r'"uploadDate" content=',html_strings)
if keyword4 is None:
pass
else:
uploaded = keyword4.end(0)
print("uploadDate",html_strings[uploaded:(uploaded + 12)])
id_cell_2[idxxx] = html_strings[uploaded:(uploaded + 12)]
keywords5 = re.search(r'"lengthSeconds":"(\d+)",',html_strings)
if keywords5 is None:
pass
else:
seconds = keywords5.group(1)
print("seconds",seconds)
id_cell_seconds[idxxx] = seconds
# for ind,p in enumerate(video_list):
# print(ind,p)
return video_list, total_list, id_cell_1, id_cell_2, id_cell_seconds, id_cell_title
def second_gether(words, x, video_list_0, already_list, total_list, id_cell_1, id_cell_2):
url_list = [ "https://www.youtube.com/watch?v=" + str(x[0]) for x in video_list_0 ]
video_list_0.clear()
if len(url_list) > 0:
sum_list = []
rem = []
for i,url in enumerate(url_list):
if not re.match(r'^http',url):
# if not re.match(r'^@',url):
continue
# url = url.replace('@ ',"https://www.youtube.com/watch\?v=")
videoid = url.replace("https://www.youtube.com/watch\?v=","")
if videoid in id_cell_1:
print("----------------")
print(x,';',(i + 1),';',"skip ")
print("----------------")
continue
if url in already_list:
print()
print(x,';',(i + 1),';',"skip ")
print(" (not relevant)")
rem.append(url)
continue
# print("----------------")
# print(x,';',(i + 1),';',url)
try:
video_list, total_list, id_cell_1, id_cell_2, id_cell_seconds, id_cell_title = first_gether(words, url, total_list, id_cell_1, id_cell_2)
except:
time.sleep(1)
continue
already_list.append(url)
# l o g
### ## ## ## ## ## ## ## ## ## ## ## ## ## ##
if videoid in id_cell_1 and videoid in id_cell_2:
with open('log.txt', 'a') as out:
try:
out.write( "vi_id: " + videoid + " up_id: " + id_cell_2[videoid] + " ch_id: " + id_cell_1[videoid] + " second: " + id_cell_seconds[videoid] + " title: " + id_cell_title[videoid] + "\n")
except:
print("file write error")
out.close()
del(id_cell_seconds, id_cell_title) # from first_gether function return values
### ## ## ## ## ## ## ## ## ## ## ## ## ## ##
if ( len(video_list) > 0 ):
# video_list = list(set(video_list))
sum_list.extend(video_list)
total_list = list(set(total_list))
if ( len(rem) > 0 ):
for remove in rem:
if remove in sum_list:
inum = sum_list.index(remove)
sum_list.pop(inum)
new_list = sorted(list(set(sum_list)),key=sum_list.index)
tempo_list_ = [ "" for x in new_list]
ziped = zip(new_list,tempo_list_,tempo_list_)
video_list_0.extend(ziped)
# for ind,p in enumerate(video_list_0):
# print(ind,p)
x = x + 1
if ( x < 4000 and len(new_list) > 0 ):
video_list, already_list, total_list, id_cell_1, id_cell_2 = second_gether(words, x, video_list_0, already_list, total_list, id_cell_1, id_cell_2)
return video_list,already_list,total_list,id_cell_1,id_cell_2
# top level
counter_x = 0
video_list_0, already_list, total_list, id_cell_1, id_cell_2 = second_gether(words, counter_x, video_list_0, already_list, total_list, id_cell_1, id_cell_2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment