Skip to content

Instantly share code, notes, and snippets.

@dauuricus
Last active November 8, 2022 08:40
Show Gist options
  • Save dauuricus/c36b7a225154f289b0e39048f521a9ff to your computer and use it in GitHub Desktop.
Save dauuricus/c36b7a225154f289b0e39048f521a9ff to your computer and use it in GitHub Desktop.
youtube search
# coding: UTF-8
import urllib.request
import urllib.parse
import re
import time
# Search phrase that seeds the crawl.  quote() percent-encodes it, so a
# non-ASCII phrase (for example words = "ひろゆき") is also safe in the URL.
words = "ransomware"
keywords = urllib.parse.quote(words)
target = f"https://www.youtube.com/results?search_query={keywords}"

# Crawl state handed to second_gether() / first_gether().
video_list_0 = [target]            # URLs to visit on the first pass
already_list, total_list = [], []  # URLs already fetched / URLs whose title matched
id_cell_1, id_cell_2 = {}, {}      # video id -> channel id / video id -> publication date
#with open("hiroyuki_yt_urls.list") as f:
#with open("hiroyuki_yt_5_id.list") as f:
# video_list_0 = [s.strip() for s in f.readlines()]
# Prefix put in front of every harvested video id.  The backslash is kept on
# purpose: the original literal "watch\?v=" contained an unrecognised escape
# sequence, so the runtime value has always included the backslash, and
# second_gether() strips exactly this prefix to recover the video id.
# NOTE(review): the backslash looks like a leftover from the regex used to find
# the ids - confirm before removing it, and change second_gether() in step.
_WATCH_URL_PREFIX = "https://www.youtube.com/watch\\?v="

# Number of characters after a video id that are searched for the keyword.
_CONTEXT_WINDOW = 1500


def _meta_content(html_strings, name, length):
    """Return *length* characters following '"<name>" content="', or None."""
    marker = '"' + name + '" content="'
    start = html_strings.find(marker)
    if start < 0:
        return None
    start += len(marker)
    return html_strings[start:start + length]


def _related_video_urls(html_strings, keyword_pattern):
    """Return watch URLs for the ids whose following text matches the keyword."""
    kept = []
    candidates = re.findall(r"watch\?v=(.{11})", html_strings)
    # dict.fromkeys() removes duplicates while keeping first-seen order, so the
    # result is deterministic (list(set(...)) was not).
    for video_id in dict.fromkeys(candidates):
        # Plain substring search: the captured text is arbitrary page content
        # and may contain regex metacharacters.  As before, only the text after
        # the FIRST occurrence of the id is inspected.
        start = html_strings.find(video_id) + len(video_id)
        context = html_strings[start:start + _CONTEXT_WINDOW]
        if re.search(keyword_pattern, context) is not None:
            kept.append(_WATCH_URL_PREFIX + video_id)
    return kept


def first_gether(target_url, total_list, id_cell_1, id_cell_2,
                 keyword_pattern=r'(?i)ransomware'):
    """Download one page and collect the keyword-related video links on it.

    The page is fetched with ``urllib.request.urlopen`` and decoded as UTF-8
    (the ``bytes.decode`` default).  What happens next depends on ``<title>``:

    * no ``<title>`` element - links are harvested only when the keyword
      occurs somewhere in the page; nothing is recorded about the page itself;
    * title does not match the keyword - the page is ignored;
    * title matches - ``target_url`` is appended to ``total_list``, the channel
      id and publication date (when their meta markers are present) are stored
      under the video id, and the links are harvested.

    A link is harvested when the keyword matches within the 1500 characters
    that follow the first occurrence of its 11-character id.

    Args:
        target_url: URL to fetch.
        total_list: list of matching page URLs; appended to in place.
        id_cell_1: dict mapping video id -> channel id; updated in place.
        id_cell_2: dict mapping video id -> publication date; updated in place.
        keyword_pattern: regular expression (string or compiled) that decides
            relevance.  Defaults to the pattern that used to be hard-coded.

    Returns:
        ``(video_list, total_list, id_cell_1, id_cell_2)`` where ``video_list``
        holds the harvested watch URLs without duplicates, in page order.

    Raises:
        Whatever ``urlopen`` raises for an unreachable URL, and
        ``UnicodeDecodeError`` when the body is not valid UTF-8.
    """
    # The context manager closes the response even if reading/decoding fails.
    with urllib.request.urlopen(target_url) as response:
        html_strings = response.read().decode()

    video_list = []
    title = re.search(r"<title>(.*?)</title>", html_strings)

    if title is None:
        # Without a title, relevance is judged on the whole page.
        if re.search(keyword_pattern, html_strings) is None:
            print("skip (not relevant)")
        else:
            video_list = _related_video_urls(html_strings, keyword_pattern)
        print("skip")
        return video_list, total_list, id_cell_1, id_cell_2

    title_strings = title.group(1)
    if re.search(keyword_pattern, title_strings) is None:
        print("skip (not relevant)")
        return video_list, total_list, id_cell_1, id_cell_2

    total_list.append(target_url)
    idxxx = target_url.replace(_WATCH_URL_PREFIX, "")
    print()
    print(idxxx)
    print('@@@', title_strings)
    print(target_url)

    # The values are fixed-width slices: 24 characters for the channel id,
    # 10 characters for the dates.
    channel_id = _meta_content(html_strings, "channelId", 24)
    if channel_id is not None:
        print("channelId", channel_id)
        id_cell_1[idxxx] = channel_id

    published = _meta_content(html_strings, "datePublished", 10)
    if published is not None:
        print("datePublished", published)
        id_cell_2[idxxx] = published

    uploaded = _meta_content(html_strings, "uploadDate", 10)
    if uploaded is not None:
        # Only reported, never stored.
        print("uploadDate", uploaded)

    video_list = _related_video_urls(html_strings, keyword_pattern)
    print(len(video_list))
    return video_list, total_list, id_cell_1, id_cell_2
def second_gether(x, url_list, already_list, total_list, id_cell_1, id_cell_2):
    """Visit every URL in *url_list*, then recurse on the links they yielded.

    Each URL is passed to ``first_gether``.  When that call leaves both a
    channel id and a publication date for the video, one line is appended to
    ``log.txt``.  The links returned by all visits form the URL list of the
    next, deeper call.  Recursion stops when no new links were found, when
    the depth counter reaches 30, or after the URL at index 30000 has been
    processed.

    Args:
        x: current recursion depth (the top-level caller passes 0).
        url_list: URLs, or bare video ids, to visit in this pass.
        already_list: URLs fetched so far; appended to in place.
        total_list: matching page URLs collected by ``first_gether``.
        id_cell_1: dict mapping video id -> channel id.
        id_cell_2: dict mapping video id -> publication date.

    Returns:
        ``(video_list, already_list, total_list, id_cell_1, id_cell_2)``.
        ``video_list`` is the link list of the last page fetched by the
        deepest pass, or ``[]`` when that pass fetched nothing.
    """
    # Runtime value intentionally contains a backslash: it must stay identical
    # to the prefix first_gether() puts on the URLs it returns.
    watch_prefix = "https://www.youtube.com/watch\\?v="
    max_depth = 30
    last_index = 30000

    # Bound before any early exit: this name used to be undefined at the
    # return statement when url_list was empty or every URL was skipped,
    # which is exactly how the recursion ends.
    video_list = []
    if not url_list:
        return video_list, already_list, total_list, id_cell_1, id_cell_2

    sum_list = []        # links discovered during this pass, in discovery order
    rem = set()          # URLs of this pass that had been fetched earlier
    id_cell_list = []    # records logged during this pass
    b_flag = True
    for i, url in enumerate(url_list):
        if not url.startswith("http"):
            # Bare video id: turn it into a full watch URL.
            url = watch_prefix + url
        print()
        print("line number", i + 1)
        print(url)
        videoid = url.replace(watch_prefix, "")
        if videoid in id_cell_1:
            print(x, ';', (i + 1), ';', "skip ")
            continue
        if url in already_list:
            print("skip (not relevant)")
            rem.add(url)
            continue
        try:
            video_list, total_list, id_cell_1, id_cell_2 = first_gether(
                url, total_list, id_cell_1, id_cell_2)
        except Exception as exc:
            # Best effort: report the failure and move on to the next URL.
            # Unlike a bare except, Ctrl-C still interrupts the crawl.
            print("fetch error", url, exc)
            time.sleep(1)
            continue
        already_list.append(url)

        if videoid in id_cell_1 and videoid in id_cell_2:
            id_cell_list.append(
                [(videoid, id_cell_1[videoid], id_cell_2[videoid])])
            # The with-block closes the file; no explicit close() is needed.
            with open("log.txt", mode="a") as out:
                try:
                    out.write("vi_id:" + videoid + " ch_id:" + id_cell_1[videoid]
                              + " pu_da:" + str(id_cell_2[videoid]) + "\n")
                except (OSError, ValueError):
                    print("write error")
            print()
            for xxx, id_cell_data in enumerate(id_cell_list):
                print(i, xxx + 1, *id_cell_data)

        if video_list:
            sum_list.extend(dict.fromkeys(video_list))
        if i == last_index:
            b_flag = False
            break

    total_list = list(dict.fromkeys(total_list))
    # Order-preserving de-duplication that also drops EVERY copy of a URL
    # already fetched (list.pop(index) removed only the first copy).
    new_list = [link for link in dict.fromkeys(sum_list) if link not in rem]

    x = x + 1
    if x < max_depth and b_flag:
        return second_gether(x, new_list, already_list, total_list,
                             id_cell_1, id_cell_2)
    return video_list, already_list, total_list, id_cell_1, id_cell_2
# Run the crawl: depth counter starts at 0 and the first pass visits the
# URLs in video_list_0.
counter_x = 0
video_list_0, already_list, total_list, id_cell_1, id_cell_2 = second_gether(
    counter_x, video_list_0, already_list, total_list, id_cell_1, id_cell_2
)
del video_list_0, already_list
###for ind,allurl in enumerate(total_list):
###    print(ind,allurl)
#with open("hiroyuki.txt","w") as out:
#
#    counter_i = 0
#    for k,v in id_cell_1.items():
#        counter_i = counter_i + 1
#        out.write(str(counter_i) + " vi_id: " + k + "\n")
#        out.write(str(counter_i) + " pu_da: " + id_cell_2[k] + "\n")
#        out.write(str(counter_i) + " ch_id: " + v + "\n")
#    out.close()
# exit() is injected by the site module for interactive sessions and is not
# available under "python -S"; raising SystemExit ends the script the same way.
raise SystemExit
################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment