Last active
November 8, 2022 08:40
-
-
Save dauuricus/c36b7a225154f289b0e39048f521a9ff to your computer and use it in GitHub Desktop.
YouTube search crawler: recursively harvests watch-page URLs (plus channel ids and publish dates) that match a search keyword.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: UTF-8 | |
import urllib.request | |
import urllib.parse | |
import re | |
import time | |
# --- search configuration --------------------------------------------------
# Keyword the crawl filters on; swap in another term (e.g. "ひろゆき") to retarget.
words = "ransomware"
keywords = urllib.parse.quote(words)
target = "https://www.youtube.com/results?search_query=" + keywords

# --- crawl state -----------------------------------------------------------
video_list_0 = [target]   # frontier: URLs queued for the next crawl pass
already_list = []         # URLs already fetched (revisit guard)
total_list = []           # pages whose title matched the keyword
id_cell_1 = {}            # video id -> channel id
id_cell_2 = {}            # video id -> published date

# Alternative: seed the frontier from a previously saved id/URL list instead
# of a fresh search, e.g.:
# with open("hiroyuki_yt_5_id.list") as f:
#     video_list_0 = [s.strip() for s in f.readlines()]
def first_gether(target_url,total_list,id_cell_1,id_cell_2):
    """Fetch one YouTube page and harvest keyword-relevant watch URLs.

    Downloads ``target_url``, decides whether the page is relevant to the
    search keyword ("ransomware"), and if so collects further ``watch?v=``
    links whose surrounding markup also mentions the keyword.  For a
    relevant watch page it additionally records channel id and publish date.

    Parameters:
        target_url: URL to fetch (search-results page or watch page).
        total_list: accumulator of relevant page URLs (appended to).
        id_cell_1:  dict mapping video id -> channel id (filled in).
        id_cell_2:  dict mapping video id -> published date (filled in).

    Returns:
        (video_list, total_list, id_cell_1, id_cell_2) where video_list
        holds the newly harvested watch URLs (empty when the page was skipped).
    """
    # Fixes vs. original: the watch-URL prefix contained an invalid "\?"
    # escape (a literal backslash broke every built URL), and video ids were
    # interpolated into regexes without re.escape().
    keyword_re = re.compile(r'(?i)ransomware')
    #keyword_re = re.compile(r'ひろゆき|hiroyuki|西村博之')
    watch_prefix = "https://www.youtube.com/watch?v="

    html_strings = urllib.request.urlopen(target_url).read().decode()
    video_list = []

    def _harvest_watch_urls():
        """Collect de-duplicated watch URLs whose nearby markup (the 1500
        chars after the id's first occurrence) mentions the keyword."""
        video_ids = re.findall(r"watch\?v=(.{11})", html_strings)
        kept = []
        for idcode in video_ids:
            # escape: harvested ids may contain regex metacharacters
            hit = re.search(re.escape(idcode), html_strings)
            if hit is None:
                continue
            context = html_strings[hit.end(0):(hit.end(0) + 1500)]
            if keyword_re.search(context) is not None:
                kept.append(idcode)
        return [watch_prefix + idcode for idcode in set(kept)]

    title = re.search(r"(?<=<title>).*?(?=</title>)", html_strings)
    if title is None:
        # No <title>: fall back to scanning the whole page for the keyword.
        if keyword_re.search(html_strings) is None:
            print("skip (not relevant)")
        else:
            video_list = _harvest_watch_urls()
            print("skip")
    else:
        title_strings = title.group()
        if keyword_re.search(title_strings) is None:
            print("skip (not relevant)")
        else:
            total_list.append(target_url)
            # Video id = everything after the last "v="; also copes with
            # legacy URLs carrying the stray backslash before "?v=".
            idxxx = target_url.rsplit("v=", 1)[-1]
            print()
            print(idxxx)
            print('@@@', title_strings)
            print(target_url)
            # Pull per-video metadata out of the page's meta tags.
            for label, marker, width, cell in (
                ("channelId", r'"channelId" content="', 24, id_cell_1),
                ("datePublished", r'"datePublished" content="', 10, id_cell_2),
                ("uploadDate", r'"uploadDate" content="', 10, None),
            ):
                found = re.search(marker, html_strings)
                if found is not None:
                    start = found.end(0)
                    value = html_strings[start:(start + width)]
                    print(label, value)
                    if cell is not None:
                        cell[idxxx] = value
            video_list = _harvest_watch_urls()
            print(len(video_list))
    return video_list, total_list, id_cell_1, id_cell_2
def second_gether(x,url_list,already_list,total_list,id_cell_1,id_cell_2):
    """Recursively crawl harvested watch URLs, level by level.

    Visits every URL in ``url_list`` via first_gether(), accumulates the
    newly harvested links, then recurses on them.  Depth is capped at 30
    levels (via ``x``) and 30001 URLs per level.

    Parameters:
        x: recursion depth counter (incremented before recursing).
        url_list: URLs (or bare 11-char video ids) to visit at this level.
        already_list: URLs already fetched; revisits are dropped.
        total_list: accumulator of relevant page URLs.
        id_cell_1: dict mapping video id -> channel id.
        id_cell_2: dict mapping video id -> published date.

    Returns:
        (video_list, already_list, total_list, id_cell_1, id_cell_2).
    """
    # BUG FIX: video_list was unbound when url_list was empty or every URL
    # was skipped, so the final return raised NameError.
    video_list = []
    watch_prefix = "https://www.youtube.com/watch?v="
    if len(url_list) > 0:
        sum_list = []       # links harvested at this level
        rem = []            # already-visited URLs to purge from sum_list
        id_cell_list = []   # (video id, channel id, date) triples seen so far
        b_flag = True       # cleared when the per-level URL cap is hit
        for i, url in enumerate(url_list):
            if not re.match(r'^http', url):
                url = watch_prefix + url   # bare video id -> full URL
            print()
            print("line number", i + 1)
            print(url)
            # Video id = everything after the last "v=" (tolerates the old
            # backslash-mangled prefix as well).
            videoid = url.rsplit("v=", 1)[-1]
            if videoid in id_cell_1:
                print(x, ';', (i + 1), ';', "skip ")
                continue
            if url in already_list:
                print("skip (not relevant)")
                rem.append(url)
                continue
            try:
                video_list, total_list, id_cell_1, id_cell_2 = first_gether(url, total_list, id_cell_1, id_cell_2)
            except Exception:   # narrowed from bare except: lets Ctrl-C through
                time.sleep(1)   # brief back-off after a failed fetch
                continue
            already_list.append(url)
            # Log the metadata once both channel id and date are known.
            if (videoid in id_cell_1) and (videoid in id_cell_2):
                id_cell_list.append([(videoid, id_cell_1[videoid], id_cell_2[videoid])])
                with open("log.txt", mode="a") as out:
                    try:
                        out.write("vi_id:" + videoid + " ch_id:" + id_cell_1[videoid] + " pu_da:" + str(id_cell_2[videoid]) + "\n")
                    except Exception:
                        print("write error")
                    # (explicit out.close() removed: the with-block closes it)
            print()
            for xxx, id_cell_data in enumerate(id_cell_list):
                print(i, xxx + 1, *id_cell_data)
            if len(video_list) > 0:
                video_list = list(set(video_list))
                sum_list.extend(video_list)
            if i == 30000:
                b_flag = False
                break
        total_list = list(set(total_list))
        if len(rem) > 0:
            # Drop every already-visited URL (the old code popped only the
            # first occurrence, letting duplicates survive the dedupe).
            revisits = set(rem)
            sum_list = [u for u in sum_list if u not in revisits]
        # Order-preserving dedupe; replaces the quadratic
        # sorted(set(...), key=sum_list.index).
        new_list = list(dict.fromkeys(sum_list))
        x = x + 1
        if x < 30 and b_flag:
            video_list, already_list, total_list, id_cell_1, id_cell_2 = second_gether(x, new_list, already_list, total_list, id_cell_1, id_cell_2)
    return video_list, already_list, total_list, id_cell_1, id_cell_2
# --- entry point: run the crawl starting from the seeded frontier -----------
counter_x = 0   # initial recursion-depth counter for second_gether
video_list_0, already_list, total_list, id_cell_1, id_cell_2 = second_gether(
    counter_x, video_list_0, already_list, total_list, id_cell_1, id_cell_2)
del video_list_0, already_list

# Results remain in total_list / id_cell_1 / id_cell_2; a dump like the one
# below can be re-enabled to persist them to disk:
# with open("results.txt", "w") as out:
#     for counter_i, (k, v) in enumerate(id_cell_1.items(), start=1):
#         out.write(str(counter_i) + " vi_id: " + k + "\n")
#         out.write(str(counter_i) + " pu_da: " + id_cell_2[k] + "\n")
#         out.write(str(counter_i) + " ch_id: " + v + "\n")

# exit() is the site-module interactive helper and may be absent when the
# interpreter runs with -S; raising SystemExit is the explicit equivalent.
raise SystemExit()
################################################ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment