YouTube clip search
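A Python scraper that searches YouTube for a keyword ("ransomware" by default), pulls video ids, titles, and durations out of the ytInitialData JSON embedded in the results page, then recursively follows each matching video's related-videos sidebar, appending every fully-identified match to log.txt.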
# coding: UTF-8
import urllib.request
import urllib.parse
import re
import time
import json
from bs4 import BeautifulSoup
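# BeautifulSoup (pip install beautifulsoup4) is the only third-party
# dependency; everything else above is in the standard library.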
words = "ransomware"
keywords = urllib.parse.quote(words)
target_url = "https://www.youtube.com/results?search_query=" + keywords
html = urllib.request.urlopen(target_url).read()
html_strings = html.decode()
del(html)
soup = BeautifulSoup(html_strings, 'html.parser')

# ytInitialData sits in one specific <script> tag on the results page; the
# hard-coded index (33) and the [58:-10] slice that trims the tag markup and
# the "var ytInitialData = ...;" wrapper are tied to YouTube's current markup
json_strings = ""
for ind, script_tag in enumerate(soup.find_all('script')):
    if ind == 33:
        json_strings = str(script_tag)[58:-10]
del(soup)
j_dict = json.loads(json_strings)
#print(json.dumps(j_dict, indent=2))
# drill through the nested renderer dicts: contents -> ... -> primaryContents
# -> ... -> contents, whose first element holds the search-result items
jj_dict = {}
for k, v in j_dict.items():
    if k == "contents":
        for kk, vv in v.items():
            for kkk, vvv in vv.items():
                if kkk == 'primaryContents':
                    for kkkk, vvvv in vvv.items():
                        for kkkkk, vvvvv in vvvv.items():
                            if kkkkk == 'contents':
                                jj_dict = vvvvv[0]
                                #print(vvvvv[1])
videoid_list = []
title_list = []
durations_list = []
channelid_list = []
for k, v in jj_dict.items():
    for kk, vv in v.items():
        if kk == 'contents':
            for ind, vvv in enumerate(vv):
                # progress indicator: item indices on one line
                if ind == len(vv) - 1:
                    print(ind)
                    print()
                else:
                    print(ind, end=", ")
                # video ids are always 11 characters; slice one out of the
                # repr right after the "videoId': '" key
                match1 = re.search('videoId', str(vvv))
                video_id = ""
                if match1 is not None:
                    startindex = match1.start(0)
                    endindex = match1.end(0)
                    video_id = str(vvv)[(startindex + 11):(endindex + 15)]
                videoid_list.append(video_id)
                # title text from the title renderer's 'runs' list
                match2 = re.search(r"'title':\s\{'runs':\s\[\{'text':\s['\"](.*?)['\"]\}\],", str(vvv))
                title = ""
                if match2 is not None:
                    title = match2.group(1)
                title_list.append(title)
                # duration ("m:ss" or "h:mm:ss") from lengthText.simpleText
                match3 = re.search(r"'lengthText':\s\{'accessibility':\s\{'accessibilityData':\s\{'label':\s'.*?'\}\},\s'simpleText':\s'(\d+?:\d+?|\d+?:\d+?:\d+?)'\},", str(vvv))
                durations = ""
                if match3 is not None:
                    durations = match3.group(1)
                durations_list.append(durations)
                del(match3, match2, match1)
#                match4 = re.search(r"\{['\"]url['\"]:\s['\"]/channel/(.+?)['\"],", str(vvv))
#                channel_id = ""
#                if match4 is not None:
#                    channel_id = match4.group(1)
#                    channelid_list.append(channel_id)
#                print(len(videoid_list), video_id)
#                print(len(title_list), title)
#                print(len(durations_list), durations)
##                print(len(channelid_list), channel_id)
# note: zip() returns a one-shot iterator, so the debug loop below would
# exhaust it before the filtering comprehension if uncommented
ziped = zip(videoid_list, title_list, durations_list)
#for ind, x in enumerate(ziped):
#    print(ind, x)
#    print()

# check values: drop rows missing any of id, title, or duration
search_list_temporary = [x for x in ziped if x[0] != '' and x[1] != '' and x[2] != '']
del(ziped)
#for ind, p in enumerate(search_list_temporary):
#    print(ind, p)
#print()

# check keywords in video title (case-insensitive)
search_list = [x for x in search_list_temporary if re.search(rf'(?i){words}', x[1])]
del(search_list_temporary)
#for ind, p in enumerate(search_list):
#    print(ind, p)
#print()

video_list_0 = []     # crawl queue of (id, title, duration) tuples
already_list = []     # watch URLs already visited
total_list = []       # watch URLs whose page title matched the keyword
id_cell_1 = {}        # video id -> channel id
id_cell_2 = {}        # video id -> upload date
video_list_0.extend(search_list)
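# first_gether(): fetch a single watch page, harvest the related-videos
# sidebar ('secondaryResults') for ids and titles, and, when the page title
# matches `words`, record the video's channel id, upload date, duration in
# seconds, and title into the id_cell_* dictionaries.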
def first_gether(words, target_url, total_list, id_cell_1, id_cell_2):
    id_cell_seconds = {}   # video id -> length in seconds
    id_cell_title = {}     # video id -> page title
    ommit = target_url.replace("https://www.youtube.com/watch?v=", '')
    html = urllib.request.urlopen(target_url).read()
    html_strings = html.decode()
    del(html)
    soup = BeautifulSoup(html_strings, 'html.parser')
    # on watch pages the ytInitialData <script> tag sits at a different
    # index (40) than on the results page
    json_strings = ""
    for ind, script_tag in enumerate(soup.find_all('script')):
        if ind == 40:
            json_strings = str(script_tag)[58:-10]
    del(soup)
    j_dict = json.loads(json_strings)
    videoid_list = []
    title_list = []
    durations_list = []
    channelid_list = []
    for k, v in j_dict.items():
        # exploration leftovers (kept for reference): the other top-level keys
        # of ytInitialData ('currentVideoEndpoint', 'trackingParams',
        # 'onResponseReceivedEndpoints', 'engagementPanels', 'topbar') were
        # printed here while reverse-engineering the page structure
        if k == 'contents':
            for kk, vv in v.items():
                for kkk, vvv in vv.items():
                    if kkk == 'results':
                        continue
                    if kkk == 'secondaryResults':
                        # every video id in the related-videos sidebar,
                        # de-duplicated but kept in page order
                        match1 = re.findall(r"'videoId':\s'(.+?)',", str(vvv))
                        if match1:   # findall returns a list, never None
                            videoid_list.extend(sorted(set(match1), key=match1.index))
                            if ommit in videoid_list:
                                videoid_list.remove(ommit)   # drop the current video itself
                        else:
                            videoid_list.append("")
                        match2 = re.findall(r"'title':\s\{'accessibility':\s\{'accessibilityData':\s\{'label':\s['\"](.*?)['\"]\}\},", str(vvv))
                        if match2:
                            title_list.extend(list(match2))
                        else:
                            title_list.append("")
                        del(match2, match1)
#    print(len(videoid_list))
#    for i, v in enumerate(videoid_list):
#        print(i, v)
#    print()
#    print(len(title_list))
#    for i, v in enumerate(title_list):
#        print(i, v)
#    print()
    # pair ids with titles and keep only related videos whose title
    # contains the keyword (case-insensitive)
    ziped2 = zip(videoid_list, title_list)
    ziped2 = [x for x in ziped2 if re.search(rf'(?i){words}', x[1])]
#    for i, v in enumerate(ziped2):
#        print(i, v)
    video_list = []
    id_data = [x[0] for x in ziped2]
    video_list.extend(id_data)
    ### video_ids #################################
    # An earlier approach (kept for reference) scanned the raw HTML instead of
    # the JSON: re.findall(r"watch\?v=(\S{11})", html_strings) collected ids,
    # the ~1400 characters following each id were checked for `words`, ids with
    # no nearby match were dropped, and the survivors (de-duplicated via set())
    # were extended onto video_list as full watch URLs.
    #################################
    # metadata for the current video, scraped from the raw watch-page HTML
    title = re.search(r"(?<=<title>).*?(?=</title>)", html_strings)
    if title is None:
        print("skip")
    else:
        title_strings = title.group()
        kakawari = re.search(rf'(?i){words}', title_strings)
        if kakawari is None:
            print()
            print(title_strings)
            print("skip (not relevant)")
            print()
        else:
            total_list.append(target_url)
            idxxx = target_url.replace("https://www.youtube.com/watch?v=", "")
            id_cell_title[idxxx] = title_strings
            print()
            print('!!!', title_strings)
            print('@@@', target_url)
            keyword2 = re.search(r'"channelId" content=', html_strings)
            if keyword2 is not None:
                # 26 chars covers the quoted 24-character channel id
                channelid = keyword2.end(0)
                print("channelId", html_strings[channelid:(channelid + 26)])
                id_cell_1[idxxx] = html_strings[channelid:(channelid + 26)]
            keyword3 = re.search(r'"datePublished" content=', html_strings)
            if keyword3 is not None:
                published = keyword3.end(0)
                print("datePublished", html_strings[published:(published + 12)])
            keyword4 = re.search(r'"uploadDate" content=', html_strings)
            if keyword4 is not None:
                # 12 chars covers the quoted YYYY-MM-DD date
                uploaded = keyword4.end(0)
                print("uploadDate", html_strings[uploaded:(uploaded + 12)])
                id_cell_2[idxxx] = html_strings[uploaded:(uploaded + 12)]
            keywords5 = re.search(r'"lengthSeconds":"(\d+)",', html_strings)
            if keywords5 is not None:
                seconds = keywords5.group(1)
                print("seconds", seconds)
                id_cell_seconds[idxxx] = seconds
    return video_list, total_list, id_cell_1, id_cell_2, id_cell_seconds, id_cell_title
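# second_gether(): recursive crawler. Turns the queued (id, title, duration)
# tuples into watch URLs, visits each through first_gether(), appends
# fully-identified videos to log.txt, then recurses on the newly discovered
# related-video ids until the queue empties or the counter hits 4000.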
def second_gether(words, x, video_list_0, already_list, total_list, id_cell_1, id_cell_2):
    video_list = []   # default so the return works even if the queue is empty
    url_list = ["https://www.youtube.com/watch?v=" + str(x[0]) for x in video_list_0]
    video_list_0.clear()
    if len(url_list) > 0:
        sum_list = []
        rem = []
        for i, url in enumerate(url_list):
            if not re.match(r'^http', url):
                # (an earlier variant queued urls prefixed with '@ ' instead)
                continue
            videoid = url.replace("https://www.youtube.com/watch?v=", "")
            if videoid in id_cell_1:
                # already fully identified on an earlier pass
                print("----------------")
                print(x, ';', (i + 1), ';', "skip ")
                print("----------------")
                continue
            if url in already_list:
                print()
                print(x, ';', (i + 1), ';', "skip ")
                print(" (not relevant)")
                rem.append(videoid)   # queue the bare id for removal below
                continue
            try:
                video_list, total_list, id_cell_1, id_cell_2, id_cell_seconds, id_cell_title = first_gether(words, url, total_list, id_cell_1, id_cell_2)
            except Exception:
                time.sleep(1)   # back off briefly on fetch/parse errors
                continue
            already_list.append(url)
            # log fully-identified videos (id, upload date, channel, length, title)
            if videoid in id_cell_1 and videoid in id_cell_2:
                with open('log.txt', 'a') as out:
                    try:
                        out.write("vi_id: " + videoid + " up_id: " + id_cell_2[videoid]
                                  + " ch_id: " + id_cell_1[videoid]
                                  + " second: " + id_cell_seconds[videoid]
                                  + " title: " + id_cell_title[videoid] + "\n")
                    except (KeyError, OSError):
                        print("file write error")
                # the with-block closes the file; no explicit close() needed
            del(id_cell_seconds, id_cell_title)   # from first_gether return values
            if len(video_list) > 0:
                sum_list.extend(video_list)
        total_list = list(set(total_list))
        # drop ids that were seen before but judged not relevant
        if len(rem) > 0:
            for remove in rem:
                if remove in sum_list:
                    sum_list.remove(remove)
        # de-duplicate while preserving discovery order, then re-queue as
        # (id, "", "") tuples matching the shape of the initial search list
        new_list = sorted(set(sum_list), key=sum_list.index)
        tempo_list_ = ["" for _ in new_list]
        ziped = zip(new_list, tempo_list_, tempo_list_)
        video_list_0.extend(ziped)
        x = x + 1
        # bounded by the counter, but Python's default recursion limit (1000)
        # will be hit well before x reaches 4000
        if x < 4000 and len(new_list) > 0:
            video_list, already_list, total_list, id_cell_1, id_cell_2 = second_gether(words, x, video_list_0, already_list, total_list, id_cell_1, id_cell_2)
    return video_list, already_list, total_list, id_cell_1, id_cell_2
# top level: kick off the crawl from the initial search results
counter_x = 0
video_list_0, already_list, total_list, id_cell_1, id_cell_2 = second_gether(words, counter_x, video_list_0, already_list, total_list, id_cell_1, id_cell_2)
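Caveat: everything here is tied to YouTube's markup at the time of writing, in particular the hard-coded script-tag indices (33 on the results page, 40 on watch pages) and the [58:-10] slice around ytInitialData, so expect to re-derive those offsets when the page layout changes; the official YouTube Data API is the stable alternative for this kind of search.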