YouTube clip search
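A Python scraper that searches YouTube for a keyword ("ransomware" by default), pulls video ids, titles, and durations out of the ytInitialData JSON embedded in the results page, then recursively follows each matching video's related-videos sidebar, appending every fully-identified match to log.txt.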
# coding: UTF-8
import urllib.request
import urllib.parse
import re
import time
import json
from bs4 import BeautifulSoup
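# BeautifulSoup (pip install beautifulsoup4) is the only third-party
# dependency; everything else above is in the standard library.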
words = "ransomware"
keywords = urllib.parse.quote(words)
target_url = "https://www.youtube.com/results?search_query=" + keywords
html = urllib.request.urlopen(target_url).read()
html_strings = html.decode()
del(html)
soup = BeautifulSoup(html_strings, 'html.parser')

# ytInitialData sits in one specific <script> tag on the results page; the
# hard-coded index (33) and the [58:-10] slice that trims the tag markup and
# the "var ytInitialData = ...;" wrapper are tied to YouTube's current markup
json_strings = ""
for ind, script_tag in enumerate(soup.find_all('script')):
    if ind == 33:
        json_strings = str(script_tag)[58:-10]
del(soup)
j_dict = json.loads(json_strings)
#print(json.dumps(j_dict, indent=2))
# drill through the nested renderer dicts: contents -> ... -> primaryContents
# -> ... -> contents, whose first element holds the search-result items
jj_dict = {}
for k, v in j_dict.items():
    if k == "contents":
        for kk, vv in v.items():
            for kkk, vvv in vv.items():
                if kkk == 'primaryContents':
                    for kkkk, vvvv in vvv.items():
                        for kkkkk, vvvvv in vvvv.items():
                            if kkkkk == 'contents':
                                jj_dict = vvvvv[0]
                                #print(vvvvv[1])
videoid_list = []
title_list = []
durations_list = []
channelid_list = []
for k, v in jj_dict.items():
    for kk, vv in v.items():
        if kk == 'contents':
            for ind, vvv in enumerate(vv):
                # progress indicator: item indices on one line
                if ind == len(vv) - 1:
                    print(ind)
                    print()
                else:
                    print(ind, end=", ")
                # video ids are always 11 characters; slice one out of the
                # repr right after the "videoId': '" key
                match1 = re.search('videoId', str(vvv))
                video_id = ""
                if match1 is not None:
                    startindex = match1.start(0)
                    endindex = match1.end(0)
                    video_id = str(vvv)[(startindex + 11):(endindex + 15)]
                videoid_list.append(video_id)
                # title text from the title renderer's 'runs' list
                match2 = re.search(r"'title':\s\{'runs':\s\[\{'text':\s['\"](.*?)['\"]\}\],", str(vvv))
                title = ""
                if match2 is not None:
                    title = match2.group(1)
                title_list.append(title)
                # duration ("m:ss" or "h:mm:ss") from lengthText.simpleText
                match3 = re.search(r"'lengthText':\s\{'accessibility':\s\{'accessibilityData':\s\{'label':\s'.*?'\}\},\s'simpleText':\s'(\d+?:\d+?|\d+?:\d+?:\d+?)'\},", str(vvv))
                durations = ""
                if match3 is not None:
                    durations = match3.group(1)
                durations_list.append(durations)
                del(match3, match2, match1)
#                match4 = re.search(r"\{['\"]url['\"]:\s['\"]/channel/(.+?)['\"],", str(vvv))
#                channel_id = ""
#                if match4 is not None:
#                    channel_id = match4.group(1)
#                    channelid_list.append(channel_id)
#                print(len(videoid_list), video_id)
#                print(len(title_list), title)
#                print(len(durations_list), durations)
##                print(len(channelid_list), channel_id)
# note: zip() returns a one-shot iterator, so the debug loop below would
# exhaust it before the filtering comprehension if uncommented
ziped = zip(videoid_list, title_list, durations_list)
#for ind, x in enumerate(ziped):
#    print(ind, x)
#    print()

# check values: drop rows missing any of id, title, or duration
search_list_temporary = [x for x in ziped if x[0] != '' and x[1] != '' and x[2] != '']
del(ziped)
#for ind, p in enumerate(search_list_temporary):
#    print(ind, p)
#print()

# check keywords in video title (case-insensitive)
search_list = [x for x in search_list_temporary if re.search(rf'(?i){words}', x[1])]
del(search_list_temporary)
#for ind, p in enumerate(search_list):
#    print(ind, p)
#print()

video_list_0 = []     # crawl queue of (id, title, duration) tuples
already_list = []     # watch URLs already visited
total_list = []       # watch URLs whose page title matched the keyword
id_cell_1 = {}        # video id -> channel id
id_cell_2 = {}        # video id -> upload date
video_list_0.extend(search_list)
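# first_gether(): fetch a single watch page, harvest the related-videos
# sidebar ('secondaryResults') for ids and titles, and, when the page title
# matches `words`, record the video's channel id, upload date, duration in
# seconds, and title into the id_cell_* dictionaries.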
def first_gether(words, target_url, total_list, id_cell_1, id_cell_2):
    id_cell_seconds = {}   # video id -> length in seconds
    id_cell_title = {}     # video id -> page title
    ommit = target_url.replace("https://www.youtube.com/watch?v=", '')
    html = urllib.request.urlopen(target_url).read()
    html_strings = html.decode()
    del(html)
    soup = BeautifulSoup(html_strings, 'html.parser')
    # on watch pages the ytInitialData <script> tag sits at a different
    # index (40) than on the results page
    json_strings = ""
    for ind, script_tag in enumerate(soup.find_all('script')):
        if ind == 40:
            json_strings = str(script_tag)[58:-10]
    del(soup)
    j_dict = json.loads(json_strings)
    videoid_list = []
    title_list = []
    durations_list = []
    channelid_list = []
    for k, v in j_dict.items():
        # exploration leftovers (kept for reference): the other top-level keys
        # of ytInitialData ('currentVideoEndpoint', 'trackingParams',
        # 'onResponseReceivedEndpoints', 'engagementPanels', 'topbar') were
        # printed here while reverse-engineering the page structure
        if k == 'contents':
            for kk, vv in v.items():
                for kkk, vvv in vv.items():
                    if kkk == 'results':
                        continue
                    if kkk == 'secondaryResults':
                        # every video id in the related-videos sidebar,
                        # de-duplicated but kept in page order
                        match1 = re.findall(r"'videoId':\s'(.+?)',", str(vvv))
                        if match1:   # findall returns a list, never None
                            videoid_list.extend(sorted(set(match1), key=match1.index))
                            if ommit in videoid_list:
                                videoid_list.remove(ommit)   # drop the current video itself
                        else:
                            videoid_list.append("")
                        match2 = re.findall(r"'title':\s\{'accessibility':\s\{'accessibilityData':\s\{'label':\s['\"](.*?)['\"]\}\},", str(vvv))
                        if match2:
                            title_list.extend(list(match2))
                        else:
                            title_list.append("")
                        del(match2, match1)
#    print(len(videoid_list))
#    for i, v in enumerate(videoid_list):
#        print(i, v)
#    print()
#    print(len(title_list))
#    for i, v in enumerate(title_list):
#        print(i, v)
#    print()
    # pair ids with titles and keep only related videos whose title
    # contains the keyword (case-insensitive)
    ziped2 = zip(videoid_list, title_list)
    ziped2 = [x for x in ziped2 if re.search(rf'(?i){words}', x[1])]
#    for i, v in enumerate(ziped2):
#        print(i, v)
    video_list = []
    id_data = [x[0] for x in ziped2]
    video_list.extend(id_data)
    ### video_ids #################################
    # An earlier approach (kept for reference) scanned the raw HTML instead of
    # the JSON: re.findall(r"watch\?v=(\S{11})", html_strings) collected ids,
    # the ~1400 characters following each id were checked for `words`, ids with
    # no nearby match were dropped, and the survivors (de-duplicated via set())
    # were extended onto video_list as full watch URLs.
    #################################
    # metadata for the current video, scraped from the raw watch-page HTML
    title = re.search(r"(?<=<title>).*?(?=</title>)", html_strings)
    if title is None:
        print("skip")
    else:
        title_strings = title.group()
        kakawari = re.search(rf'(?i){words}', title_strings)
        if kakawari is None:
            print()
            print(title_strings)
            print("skip (not relevant)")
            print()
        else:
            total_list.append(target_url)
            idxxx = target_url.replace("https://www.youtube.com/watch?v=", "")
            id_cell_title[idxxx] = title_strings
            print()
            print('!!!', title_strings)
            print('@@@', target_url)
            keyword2 = re.search(r'"channelId" content=', html_strings)
            if keyword2 is not None:
                # 26 chars covers the quoted 24-character channel id
                channelid = keyword2.end(0)
                print("channelId", html_strings[channelid:(channelid + 26)])
                id_cell_1[idxxx] = html_strings[channelid:(channelid + 26)]
            keyword3 = re.search(r'"datePublished" content=', html_strings)
            if keyword3 is not None:
                published = keyword3.end(0)
                print("datePublished", html_strings[published:(published + 12)])
            keyword4 = re.search(r'"uploadDate" content=', html_strings)
            if keyword4 is not None:
                # 12 chars covers the quoted YYYY-MM-DD date
                uploaded = keyword4.end(0)
                print("uploadDate", html_strings[uploaded:(uploaded + 12)])
                id_cell_2[idxxx] = html_strings[uploaded:(uploaded + 12)]
            keywords5 = re.search(r'"lengthSeconds":"(\d+)",', html_strings)
            if keywords5 is not None:
                seconds = keywords5.group(1)
                print("seconds", seconds)
                id_cell_seconds[idxxx] = seconds
    return video_list, total_list, id_cell_1, id_cell_2, id_cell_seconds, id_cell_title
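# second_gether(): recursive crawler. Turns the queued (id, title, duration)
# tuples into watch URLs, visits each through first_gether(), appends
# fully-identified videos to log.txt, then recurses on the newly discovered
# related-video ids until the queue empties or the counter hits 4000.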
def second_gether(words, x, video_list_0, already_list, total_list, id_cell_1, id_cell_2):
    video_list = []   # default so the return works even if the queue is empty
    url_list = ["https://www.youtube.com/watch?v=" + str(x[0]) for x in video_list_0]
    video_list_0.clear()
    if len(url_list) > 0:
        sum_list = []
        rem = []
        for i, url in enumerate(url_list):
            if not re.match(r'^http', url):
                # (an earlier variant queued urls prefixed with '@ ' instead)
                continue
            videoid = url.replace("https://www.youtube.com/watch?v=", "")
            if videoid in id_cell_1:
                # already fully identified on an earlier pass
                print("----------------")
                print(x, ';', (i + 1), ';', "skip ")
                print("----------------")
                continue
            if url in already_list:
                print()
                print(x, ';', (i + 1), ';', "skip ")
                print(" (not relevant)")
                rem.append(videoid)   # queue the bare id for removal below
                continue
            try:
                video_list, total_list, id_cell_1, id_cell_2, id_cell_seconds, id_cell_title = first_gether(words, url, total_list, id_cell_1, id_cell_2)
            except Exception:
                time.sleep(1)   # back off briefly on fetch/parse errors
                continue
            already_list.append(url)
            # log fully-identified videos (id, upload date, channel, length, title)
            if videoid in id_cell_1 and videoid in id_cell_2:
                with open('log.txt', 'a') as out:
                    try:
                        out.write("vi_id: " + videoid + " up_id: " + id_cell_2[videoid]
                                  + " ch_id: " + id_cell_1[videoid]
                                  + " second: " + id_cell_seconds[videoid]
                                  + " title: " + id_cell_title[videoid] + "\n")
                    except (KeyError, OSError):
                        print("file write error")
                # the with-block closes the file; no explicit close() needed
            del(id_cell_seconds, id_cell_title)   # from first_gether return values
            if len(video_list) > 0:
                sum_list.extend(video_list)
        total_list = list(set(total_list))
        # drop ids that were seen before but judged not relevant
        if len(rem) > 0:
            for remove in rem:
                if remove in sum_list:
                    sum_list.remove(remove)
        # de-duplicate while preserving discovery order, then re-queue as
        # (id, "", "") tuples matching the shape of the initial search list
        new_list = sorted(set(sum_list), key=sum_list.index)
        tempo_list_ = ["" for _ in new_list]
        ziped = zip(new_list, tempo_list_, tempo_list_)
        video_list_0.extend(ziped)
        x = x + 1
        # bounded by the counter, but Python's default recursion limit (1000)
        # will be hit well before x reaches 4000
        if x < 4000 and len(new_list) > 0:
            video_list, already_list, total_list, id_cell_1, id_cell_2 = second_gether(words, x, video_list_0, already_list, total_list, id_cell_1, id_cell_2)
    return video_list, already_list, total_list, id_cell_1, id_cell_2
# top level: kick off the crawl from the initial search results
counter_x = 0
video_list_0, already_list, total_list, id_cell_1, id_cell_2 = second_gether(words, counter_x, video_list_0, already_list, total_list, id_cell_1, id_cell_2)
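Caveat: everything here is tied to YouTube's markup at the time of writing, in particular the hard-coded script-tag indices (33 on the results page, 40 on watch pages) and the [58:-10] slice around ytInitialData, so expect to re-derive those offsets when the page layout changes; the official YouTube Data API is the stable alternative for this kind of search.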