#!/usr/local/bin/python3
'''
FACEBOOK GROUP DATA DOWNLOADER
Put this file (fbg.py) in a directory alongside a file (groups.csv)
with two comma-separated columns [group,id].
Add your personal FB credentials to this file (fbg.py), in the
client_id and client_secret variables below.
Run fbg.py with Python 3 in Terminal:
$ python fbg.py
'''
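# For reference, a minimal groups.csv could look like the sketch below
# (header row included; the group names and ids are made-up examples):
#
#   group,id
#   my_first_group,123456789012345
#   some/other_group,987654321098765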
# Import and setup
import os
import re
import glob
import pandas as pd
from fb_scrape_public import fb_scrape_public as fsp
import shutil
import time
import xlsxwriter
import csv
# FB credentials
client_id = ''
client_secret = ''
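# (These are typically a Facebook app ID and app secret obtained via
# https://developers.facebook.com/)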
# Read the file
groups = pd.read_csv("groups.csv")
# Cleanup data folders
try:
    shutil.rmtree('posts_data')
    shutil.rmtree('comments_data')
except:
    pass
# Get everything
grps = [g.replace('/', '_') for g in groups.group.tolist()]
for group, id in zip(grps, groups.id.tolist()):
print("Processing " + group)
# Reset data folders
try:
shutil.rmtree('posts_data')
shutil.rmtree('comments_data')
except:
pass
# Create posts data dir
os.makedirs('posts_data')
# Get the posts
comments = fsp.scrape_fb(client_id, client_secret, str(id)) # use fb_scrape_public
filename = str(id) + "posts.csv"
os.rename("fb_data.csv", "posts_data/" + filename) # save the file to the data directory
post_file = glob.glob("posts_data/*.csv")
posts_df = pd.read_csv(post_file[0])
posts_df = posts_df.replace({'\n|\t|\r': ' '}, regex=True) # remove linebreaks and tabs in the dataframe
# Add a copy of the message column for future sorting purposes
posts_df['post_sorting'] = posts_df['message']
# Get the post ids from the posts dataframe to be able to scrape the comments
post_ids = posts_df.iloc[:, 16].tolist()
    # Create comments data dir
    os.makedirs('comments_data')
    # Get the comments
    for count, id in enumerate(post_ids):
        print("Getting comments for post " + str(count+1) + "/" + str(len(post_ids)) + " (" + id + ")")
        try:
            comments = fsp.scrape_fb(client_id, client_secret, id, scrape_mode="comments") # use fb_scrape_public
            filename = str(id) + "_scraped.csv"
            os.rename("fb_data.csv", "comments_data/" + filename) # save the file to the data directory
        except Exception as e:
            time.sleep(2)
    # Create a comments dataframe
    all_comments_files = glob.glob("comments_data/*.csv")
    # Make a list of dataframes and add a column with group_id
    df_from_each_file = []
    for count, f in enumerate(all_comments_files):
        f2 = f[14:-12] # strip the "comments_data/" prefix and the "_scraped.csv" suffix
        f2 = f2.split('_')
        f2 = f2[0]
        df = pd.read_csv(f)
        df['group_id'] = f2
        df_from_each_file.append(df)
        print("Reading file " + str(count+1) + "/" + str(len(all_comments_files)))
    if len(df_from_each_file) > 0:
        print("Joining files ...")
        comments_df = pd.concat(df_from_each_file, ignore_index=True)
        print("Done!")
        # Rename the 'original_message' column for future sorting purposes
        comments_df.rename(columns={'original_message': 'post_sorting'}, inplace=True)
        # Keep only some of the columns in comments_df
        comments_df = comments_df.loc[:, ['from', 'comment', 'created_time', 'group_id', 'post_sorting']]
        # Add a column 'type' to the comments_df
        comments_df['type'] = 'comment'
        # Rename column 'comment' to 'message'
        comments_df.rename(columns={'comment': 'message'}, inplace=True)
        # Keep only some of the columns in posts_df
        posts_df = posts_df.loc[:, ['from', 'message', 'type', 'created_time', 'group_id', 'post_sorting']]
        # Reorder columns to match order of comments_df
        posts_df = posts_df[['from', 'message', 'created_time', 'group_id', 'post_sorting', 'type']]
        fb_groups = pd.concat([posts_df, comments_df], ignore_index=True)
        # Reorder columns
        fb_groups = fb_groups[['group_id', 'post_sorting', 'created_time', 'type', 'from', 'message']]
        # Rename column 'created_time' to 'time'
        fb_groups.rename(columns={'created_time': 'time'}, inplace=True)
        # Clean up 'time' column
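        # (e.g. a created_time such as '2018-03-15T14:58:00+0000' should end up
        # as '180315_145800'; the ISO-8601 input format here is an assumption
        # based on what the Graph API usually returns)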
        fb_groups['time'] = fb_groups['time'].str.replace('-', '')
        fb_groups['time'] = fb_groups['time'].str.replace('T', '_')
        fb_groups['time'] = fb_groups['time'].str.replace(':', '')
        fb_groups['time'] = fb_groups['time'].str.replace(r'\+0000', '', regex=True)
        fb_groups['time'] = fb_groups['time'].str.replace('^20', '', regex=True)
        # Sort by group id, then by time and post
        fb_groups = fb_groups.sort_values(['group_id', 'time', 'post_sorting'], ascending=True)
        # Replace linebreaks and tabs in the dataframe
        fb_groups = fb_groups.replace({'\n|\r|\t': ' '}, regex=True)
        # Then sort by post (post_sorting) first and time second, so each post's comments follow it
        fb_groups = fb_groups.sort_values(['post_sorting', 'time'])
        file_name = "FBgroup_" + group + ".csv"
        # Save to file
        fb_groups.to_csv(file_name, index=False)
        print("File " + file_name + " saved.\n")
    else:
        print("Unable to get data from this group.")
        pass
    try:
        shutil.rmtree('posts_data')
        shutil.rmtree('comments_data')
    except:
        pass
# When done, also create xlsx versions of all csvs
csvs = glob.glob("FBgroup*.csv")
for csvfile in csvs:
    workbook = xlsxwriter.Workbook(csvfile[:-4] + ".xlsx") # create a workbook
    worksheet = workbook.add_worksheet("DATA") # add and name a worksheet
    with open(csvfile, "r") as input_csv:
        table = csv.reader(input_csv)
        i = 0
        for row in table:
            worksheet.write_row(i, 0, row)
            i += 1
    workbook.close()