Make YouTube playlist from posts in Facebook group
The gist is three scripts. They appear here in the order they were saved, but run them in reverse: the third scrapes every post in the group via the Facebook Graph API, the second filters those posts down to well-liked YouTube links, and the first inserts the surviving videos into a playlist via the YouTube Data API. First, the playlist builder, which reads the filtered URLs from urls.json:
import requests
import json
import re

from more_itertools import unique_everseen

with open("urls.json", "r") as fp:
    urls = json.load(fp)

# extract the video ID from the different shapes of YouTube link
def get_id(url):
    if "youtube" in url:
        if "attribution" in url:
            # attribution_link URLs carry the watch URL percent-encoded,
            # so the ID sits between "%3D" ("=") and "%26" ("&")
            try:
                video_id = re.search(r"%3D(.*?)%26", url).group(1)
            except AttributeError:
                print(url)
                video_id = ""
        else:
            # standard watch URLs: capture the v= (or ci=) parameter,
            # stopping at the next "&" so trailing parameters are dropped
            try:
                video_id = re.search(r"[?&](v|ci)=([^&]*)", url).group(2)
            except AttributeError:
                print(url)
                video_id = ""
    else:
        # short youtu.be links: everything after ".be/" up to any query string
        try:
            video_id = re.search(r"\.be/([^?&]*)", url).group(1)
        except AttributeError:
            print(url)
            video_id = ""
    return video_id

# get video IDs from the URLs, drop failures, then deduplicate
# while preserving order
ids = list(map(get_id, urls))
ids = list(filter(lambda x: len(x) > 0, ids))
ids = list(unique_everseen(ids))

# an access token can be acquired from https://developers.google.com/oauthplayground/
access_token = "YOUR_ACCESS_TOKEN"
# your YouTube playlist ID can be found in the playlist URL
playlist_id = "YOUR_PLAYLIST_ID"

# YouTube Data API v3 playlistItems endpoint
path = "https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&access_token={access_token}".format(access_token=access_token)
headers = {"Content-Type": "application/json"}

# finally, insert each video into the playlist
for video_id in ids:
    payload = {
        "snippet": {
            "playlistId": playlist_id,
            "resourceId": {
                "videoId": video_id,
                "kind": "youtube#video"
            }
        }
    }
    response = requests.post(path, data=json.dumps(payload), headers=headers)
    # print the API response to confirm each insert succeeded
    print(response.content)
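To sanity-check the regexes before spending API quota, a quick throwaway test along these lines can help (the URLs below are made-up examples, not from the original gist):

# hypothetical smoke test for get_id, covering the three link shapes it handles
test_urls = [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ&feature=share",
    "https://youtu.be/dQw4w9WgXcQ",
    "https://www.youtube.com/attribution_link?a=abc&u=%2Fwatch%3Fv%3DdQw4w9WgXcQ%26feature%3Dshare",
]
for url in test_urls:
    assert get_id(url) == "dQw4w9WgXcQ", url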
Next, the filtering script. It flattens the scraped post data from all_group_post_data.json into a pandas dataframe, keeps only YouTube links with enough likes, and writes the survivors to urls.json:
import pandas as pd
import json

# set the minimum number of likes here
required_likes = 5

all_post_data = []
filename = "all_group_post_data.json"
with open(filename, "r", encoding="utf-8") as fp:
    all_post_data += json.load(fp)

one_dimension_data = []
# flatten the nested JSON into one flat dict per post for use with pandas;
# fields the Graph API omitted default to "" (or 0 for likes)
for post in all_post_data:
    post_data = {
        "poster": post.get("from", {}).get("name", ""),
        "message": post.get("message", ""),
        "name": post.get("name", ""),
        "link": post.get("link", ""),
        "likes": len(post.get("likes", {}).get("data", [])),
        "url": post.get("permalink_url", ""),
        "time": post.get("created_time", ""),
    }
    one_dimension_data.append(post_data)

# create a dataframe for easy filtering and manipulation
df = pd.DataFrame(one_dimension_data)
# keep only posts whose link field contains 'youtu'
# (matches both youtube.com and youtu.be)
youtube_df = df[df["link"].str.contains("youtu")]
# keep only posts with required_likes or more
likes_youtube_df = youtube_df[youtube_df["likes"] >= required_likes]

urls = list(likes_youtube_df["link"])

# serialise the YouTube URLs for the playlist script
with open("urls.json", "w") as fp:
    json.dump(urls, fp)
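Before handing the list over to the playlist script, it can be worth eyeballing what survived the filter; a throwaway line like this (not in the original gist) prints the most-liked posts:

# hypothetical: show the ten most-liked YouTube posts that passed the filter
print(likes_youtube_df.sort_values("likes", ascending=False)[["poster", "name", "likes"]].head(10))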
Finally, the scraper. It walks the group feed's pagination, collects every post ID, then fetches the full data for each post and serialises it for the filtering script:
import requests
import json

# Facebook Graph API path; the group_id appears in the group URL
# a personal access token can be acquired here: https://developers.facebook.com/tools/accesstoken/
path = "https://graph.facebook.com/v2.9/"
group_id = "YOUR_GROUP_ID"
group_posts_path = "{group_id}/feed/?fields=id".format(group_id=group_id)
access_token = "YOUR_ACCESS_TOKEN"
group_url = "{path}{group_posts_path}&access_token={access_token}".format(
    path=path,
    group_posts_path=group_posts_path,
    access_token=access_token)

url_array = []

# walk the feed pagination, collecting the URL of every non-empty page
def recurse_pages(url):
    global url_array
    print("Getting page {url}".format(url=url))
    response = requests.get(url)
    json_content = response.json()
    if len(json_content["data"]) > 0:
        # record the current page (not just the next one), so the
        # first page of the feed is scraped too
        url_array.append(url)
        # the last page may have no "next" link, so don't assume it exists
        next_url = json_content.get("paging", {}).get("next")
        if next_url:
            recurse_pages(next_url)

recurse_pages(group_url)

# serialise the page URLs to scrape
with open("all_group_urls.json", "w") as fp:
    json.dump(url_array, fp)

all_post_ids = []

# scrape every page for post IDs
def get_ids(url):
    print("\nGetting IDs for {}".format(url))
    post_ids = []
    response = requests.get(url)
    json_content = response.json()
    for post in json_content["data"]:
        post_ids.append(post["id"])
    return post_ids

for url in url_array:
    all_post_ids += get_ids(url)

# serialise post IDs
with open("all_group_post_ids.json", "w") as fp:
    json.dump(all_post_ids, fp)

all_post_data = []
# the fields to request for each post
fields = "link,story,message,created_time,from,id,likes,name,permalink_url"

# fetch the full data for every post ID collected above
def get_post_data(post_id):
    url = "https://graph.facebook.com/{post_id}?fields={fields}&access_token={access_token}".format(
        post_id=post_id,
        fields=fields,
        access_token=access_token)
    print("\nGetting data from {url}".format(url=url))
    response = requests.get(url)
    return response.json()

for post_id in all_post_ids:
    all_post_data.append(get_post_data(post_id))

# serialise post data for the filtering script
with open("all_group_post_data.json", "w") as fp:
    json.dump(all_post_data, fp)
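One caveat: the pagination walk is recursive, so a group with more than roughly 1000 pages of posts would hit Python's default recursion limit. A loop-based sketch like the following (hypothetical, not from the gist) collects the same page URLs without that limit:

# hypothetical iterative replacement for recurse_pages: same page-collecting
# logic as above, but a while loop instead of recursion
def collect_pages(start_url):
    pages = []
    url = start_url
    while url:
        content = requests.get(url).json()
        if not content.get("data"):
            break
        pages.append(url)
        url = content.get("paging", {}).get("next")
    return pages

url_array = collect_pages(group_url)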