Last active: October 30, 2022 07:43
Save dauuricus/dc59dc537b2ee6164bf7ee1a18f61472 to your computer and use it in GitHub Desktop.
youtube search python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: UTF-8 | |
import urllib.request | |
import urllib.parse | |
import re | |
import time | |
# Search keyword and the seed search-results URL for the crawl.
words = "ひろゆき"
keywords = urllib.parse.quote(words)
target = "https://www.youtube.com/results?search_query=" + keywords

# Shared crawl state consumed by the gatherer functions below.
video_list_0 = [target]  # frontier: URLs still to visit (seeded with the search page)
already_list = []        # URLs that have already been fetched
total_list = []          # relevant watch URLs discovered so far
id_cell_1 = {}           # video id -> channelId
id_cell_2 = {}           # video id -> datePublished
# Optionally seed the frontier from a previously saved list instead:
#with open("hiroyuki_yt_urls.list") as f:
#with open("hiroyuki_yt_5_id.list") as f:
#    video_list_0 = [s.strip() for s in f.readlines()]
def first_gether(target_url,total_list,id_cell_1,id_cell_2): | |
html = urllib.request.urlopen(target_url).read() | |
html_strings = html.decode() | |
del(html) | |
video_list = [] | |
title = re.search(r"(?<=\<title\>).*?(?=\<\/title\>)",html_strings) | |
if title is None: | |
kakawari = re.search(r'ひろゆき|hiroyuki|西村博之',html_strings) | |
if kakawari is None: | |
print("skip (not relevant)") | |
else: | |
omitlist = [] | |
video_ids = re.findall(r"watch\?v=(...........)", html_strings) | |
if len(video_ids) > 0: | |
for ind,idcode in enumerate(video_ids): | |
mmmm = re.search(rf'{idcode}',html_strings) | |
if mmmm is not None: | |
endindex = mmmm.end(0) | |
maybetitle = html_strings[endindex:(endindex + 1500)] | |
kakawari = re.search(r'ひろゆき|hiroyuki|西村博之',maybetitle) | |
if kakawari is None: | |
omitlist.append(video_ids[ind]) | |
if len(omitlist) > 0: | |
for omit in omitlist: | |
video_ids = [ x for x in video_ids if x != omit ] | |
del(omitlist) | |
id_data = list(set(video_ids)) | |
del(video_ids) | |
video_list = ["https://www.youtube.com/watch\?v=" + str(x) for x in id_data] | |
del(id_data) | |
print("skip") | |
else: | |
title_strings = title.group() | |
kakawari = re.search(r'ひろゆき|hiroyuki|西村博之',title_strings) | |
if kakawari is None: | |
print("skip (not relevant)") | |
pass | |
else: | |
total_list.append(target_url) | |
idxxx = target_url.replace("https://www.youtube.com/watch\?v=","") | |
print() | |
print(idxxx) | |
print('@@@',title_strings) | |
print(target_url) | |
keyword2 = re.search(r'"channelId" content="',html_strings) | |
if keyword2 is None: | |
pass | |
else: | |
channelid = keyword2.end(0) | |
print("channelId",html_strings[channelid:(channelid + 24)]) | |
id_cell_1[idxxx] = html_strings[channelid:(channelid + 24)] | |
keyword3 = re.search(r'"datePublished" content="',html_strings) | |
if keyword3 is None: | |
pass | |
else: | |
published = keyword3.end(0) | |
print("datePublished",html_strings[published:(published + 10)]) | |
id_cell_2[idxxx] = html_strings[published:(published + 10)] | |
keyword4 = re.search(r'"uploadDate" content="',html_strings) | |
if keyword4 is None: | |
pass | |
else: | |
uploaded = keyword4.end(0) | |
print("uploadDate",html_strings[uploaded:(uploaded + 10)]) | |
omitlist = [] | |
video_ids = re.findall(r"watch\?v=(...........)", html_strings) | |
if len(video_ids) > 0: | |
for ind,idcode in enumerate(video_ids): | |
mmmm = re.search(rf'{idcode}',html_strings) | |
if mmmm is not None: | |
endindex = mmmm.end(0) | |
maybetitle = html_strings[endindex:(endindex + 1500)] | |
kakawari = re.search(r'ひろゆき|hiroyuki|西村博之',maybetitle) | |
if kakawari is None: | |
omitlist.append(video_ids[ind]) | |
if len(omitlist) > 0: | |
temp = [] | |
for omit in omitlist: | |
for x in video_ids: | |
if (x != omit): | |
temp.append(x) | |
video_ids = temp | |
#video_ids = [ x for x in video_ids if x != omit ] | |
del(omitlist) | |
id_data = list(set(video_ids)) | |
del(video_ids) | |
print(len(id_data)) | |
video_list = ["https://www.youtube.com/watch\?v=" + str(x) for x in id_data] | |
del(id_data) | |
return video_list,total_list,id_cell_1,id_cell_2 | |
def second_gether(x, url_list, already_list, total_list, id_cell_1, id_cell_2):
    """Recursively crawl *url_list*, fetching each watch URL via
    first_gether and following newly discovered links.

    Parameters
    ----------
    x : int
        Current recursion depth; recursion stops at 300.
    url_list : list
        URLs (or bare video ids) to visit on this pass.
    already_list, total_list : list
        Visited-URL and relevant-URL accumulators (mutated in place).
    id_cell_1, id_cell_2 : dict
        video id -> channelId / datePublished maps (mutated in place).

    Returns
    -------
    tuple
        (video_list, already_list, total_list, id_cell_1, id_cell_2);
        video_list is [] when nothing new was fetched.

    Side effects: appends "vi_id/ch_id/pu_da" records to hiro_yt1.txt and
    sleeps 1s between requests to stay polite to the server.
    """
    # Tolerate both the corrected prefix and the legacy backslashed one
    # ("watch\?v=") produced by earlier revisions of this script.
    prefix_re = re.compile(r'https://www\.youtube\.com/watch\\?\?v=')
    video_list = []  # always bound, even when url_list is empty or fully skipped
    if len(url_list) > 0:
        sum_list = []       # newly discovered URLs across this pass
        rem = []            # URLs recognized as already visited
        id_cell_list = []   # (video id, channelId, datePublished) triples
        b_flag = True       # cleared when the hard per-pass item cap is hit
        for i, url in enumerate(url_list):
            if not url.startswith("http"):
                # Bare video id: expand it to a full watch URL.
                url = "https://www.youtube.com/watch?v=" + url
            print()
            print(i + 1)
            print(url)
            videoid = prefix_re.sub("", url, count=1)
            if videoid in id_cell_1:
                print(x, ';', (i + 1), ';', "skip ")
                continue
            if url in already_list:
                print("skip (not relevant)")
                rem.append(url)
                continue
            try:
                video_list, total_list, id_cell_1, id_cell_2 = first_gether(url, total_list, id_cell_1, id_cell_2)
            except Exception:
                # Network/parse failure: back off briefly, then move on.
                time.sleep(1)
                continue
            time.sleep(1)
            already_list.append(url)
            if (videoid in id_cell_1) and (videoid in id_cell_2):
                id_cell_list.append([(videoid, id_cell_1[videoid], id_cell_2[videoid])])
                with open("hiro_yt1.txt", mode="a") as out:
                    try:
                        out.write("vi_id:" + videoid + " ch_id:" + id_cell_1[videoid] + " pu_da:" + str(id_cell_2[videoid]) + "\n")
                    except Exception:
                        print("write error")
            print()
            for xxx, id_cell_data in enumerate(id_cell_list):
                print(i, xxx + 1, *id_cell_data)
            if len(video_list) > 0:
                video_list = list(set(video_list))
                sum_list.extend(video_list)
            if i == 30000:
                b_flag = False
                break
        total_list = list(set(total_list))
        if len(rem) > 0:
            # Drop already-visited URLs in one pass instead of repeated pops.
            removed = set(rem)
            sum_list = [u for u in sum_list if u not in removed]
        # Deduplicate while preserving first-seen order.
        new_list = sorted(set(sum_list), key=sum_list.index)
        x = x + 1
        if x < 300 and b_flag:
            video_list, already_list, total_list, id_cell_1, id_cell_2 = second_gether(x, new_list, already_list, total_list, id_cell_1, id_cell_2)
    return video_list, already_list, total_list, id_cell_1, id_cell_2
# Kick off the crawl from the seed frontier at recursion depth 0.
counter_x = 0
video_list_0, already_list, total_list, id_cell_1, id_cell_2 = second_gether(
    counter_x, video_list_0, already_list, total_list, id_cell_1, id_cell_2)
# The frontier and visited list are no longer needed once the crawl ends.
del video_list_0, already_list

###for ind, allurl in enumerate(total_list):
###    print(ind, allurl)
# Optional dump of the collected id/channel/date tables:
#with open("hiroyuki.txt", "w") as out:
#    counter_i = 0
#    for k, v in id_cell_1.items():
#        counter_i = counter_i + 1
#        out.write(str(counter_i) + " vi_id: " + k + "\n")
#        out.write(str(counter_i) + " pu_da: " + id_cell_2[k] + "\n")
#        out.write(str(counter_i) + " ch_id: " + v + "\n")
exit()
################################################ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment