simonlindgren · March 15, 2018 14:58
diff --git a/fbg.py b/fbg.py
 #!/usr/local/bin/python3
 '''
 FACEBOOK GROUP DATA DOWNLOADER

 Put this file (fbg.py) in a directory alongside a file (groups.csv)
 with two comma separated columns [group,id].

 Add your personal FB credentials to this file (fbg.py, line 25-26).

 Run fbg.py with Python in Terminal:
 $ python fbg.py
 '''
 # Import and setup
 import os
 import re
 import glob
 import pandas as pd
 from fb_scrape_public import fb_scrape_public as fsp
 import shutil
 import time
 import xlsxwriter
 import csv

 # FB credentials
 client_id = ''
 client_secret = ''

 # Scratch directories holding the raw scraper output of the group that is
 # currently being processed. Both are wiped before and after every group.
 DATA_DIRS = ('posts_data', 'comments_data')

 # Regex substitutions applied, in this order, to the 'time' column.
 # Example: 2018-03-15T14:58:00+0000 becomes 180315_145800
 TIME_CLEANUP = (
    ('-', ''),
    ('T', '_'),
    (':', ''),
    (r'\+0000', ''),
    ('^20', ''),
 )


 def _reset_data_dirs():
    """Delete the scratch directories, skipping any that do not exist.

    Each directory is removed independently, so a missing 'posts_data'
    no longer prevents 'comments_data' from being cleaned up.
    """
    for data_dir in DATA_DIRS:
        shutil.rmtree(data_dir, ignore_errors=True)


 # Read the file. DataFrame.from_csv was removed in pandas 1.0; read_csv is
 # its replacement. Everything is read as text so that ids and purely
 # numeric group names stay strings.
 groups = pd.read_csv("groups.csv", dtype=str)

 # Cleanup data folders left behind by an earlier run
 _reset_data_dirs()

 # Get everything ('/' is replaced because the group name ends up in the
 # name of the output file)
 grps = [g.replace('/', '_') for g in groups['group'].tolist()]

 for group, group_id in zip(grps, groups['id'].tolist()):

    print("Processing " + group)

    # Start every group from empty data folders
    _reset_data_dirs()
    os.makedirs('posts_data')

    # Get the posts. The code expects scrape_fb to leave its output in
    # fb_data.csv in the working directory; that file is moved into the
    # posts folder and read back from there.
    fsp.scrape_fb(client_id, client_secret, str(group_id))
    posts_path = "posts_data/" + str(group_id) + "posts.csv"
    os.rename("fb_data.csv", posts_path)
    posts_df = pd.read_csv(posts_path)

    # Remove linebreaks and tabs in the dataframe
    posts_df = posts_df.replace({r'\n|\t|\r': ' '}, regex=True)

    # Add a copy of the message column for future sorting purposes
    posts_df['post_sorting'] = posts_df['message']

    # Get the post ids to be able to scrape the comments.
    # NOTE(review): the ids are taken by position (17th column of the
    # scraper output) - confirm this matches the fb_scrape_public layout.
    post_ids = posts_df.iloc[:, 16].tolist()

    os.makedirs('comments_data')

    # Get the comments. A dedicated loop variable is used so that the
    # group id of the outer loop is not overwritten.
    for count, post_id in enumerate(post_ids, start=1):
        print("Getting comments to post {}/{} ({})".format(
            count, len(post_ids), post_id))

        try:
            fsp.scrape_fb(client_id, client_secret, post_id,
                          scrape_mode="comments")
            os.rename("fb_data.csv",
                      "comments_data/" + str(post_id) + "_scraped.csv")
        except Exception as e:
            # Best effort: report the failing post, pause, then move on.
            print("Could not get comments for post {}: {}".format(
                post_id, e))
            time.sleep(2)

    # Make a list of dataframes, one per comments file, and add a column
    # with the group id, i.e. the part of the file name that precedes the
    # first underscore.
    all_comments_files = glob.glob("comments_data/*.csv")
    df_from_each_file = []
    for count, f in enumerate(all_comments_files, start=1):
        df = pd.read_csv(f)
        df['group_id'] = os.path.basename(f).split('_')[0]
        df_from_each_file.append(df)

        print("Reading file {}/{}".format(count, len(all_comments_files)))

    if not df_from_each_file:
        print("Unable to get data from this group.")
        continue

    print("Joining files ...")
    comments_df = pd.concat(df_from_each_file, ignore_index=True)
    print("Done!")

    # Rename the 'original_message' column for future sorting purposes
    comments_df = comments_df.rename(
        columns={'original_message': 'post_sorting'})

    # Keep only some of the columns. reindex is used instead of .loc so
    # that a column missing from the scraper output becomes an empty (NaN)
    # column instead of raising KeyError on pandas >= 1.0.
    comments_df = comments_df.reindex(
        columns=['from', 'comment', 'created_time', 'group_id',
                 'post_sorting'])

    # Mark the rows as comments and align the column name with posts_df
    comments_df['type'] = 'comment'
    comments_df = comments_df.rename(columns={'comment': 'message'})

    # Keep the matching columns of posts_df, in the order of comments_df
    posts_df = posts_df.reindex(
        columns=['from', 'message', 'created_time', 'group_id',
                 'post_sorting', 'type'])

    fb_groups = pd.concat([posts_df, comments_df], ignore_index=True)

    # Reorder columns and rename 'created_time' to 'time'
    fb_groups = fb_groups[['group_id', 'post_sorting', 'created_time',
                           'type', 'from', 'message']]
    fb_groups = fb_groups.rename(columns={'created_time': 'time'})

    # Clean up 'time' column. regex=True is passed explicitly because the
    # default of Series.str.replace became regex=False in pandas 2.0.
    for pattern, replacement in TIME_CLEANUP:
        fb_groups['time'] = fb_groups['time'].str.replace(
            pattern, replacement, regex=True)

    # Sort by group id, then time, then thread text
    fb_groups = fb_groups.sort_values(
        ['group_id', 'time', 'post_sorting'], ascending=True)

    # Replace linebreaks and tabs in the dataframe
    fb_groups = fb_groups.replace({r'\n|\r|\t': ' '}, regex=True)

    # Final order: by thread text, then by time within each thread
    fb_groups = fb_groups.sort_values(['post_sorting', 'time'])

    file_name = "FBgroup_" + group + ".csv"

    # Save to file
    fb_groups.to_csv(file_name, index=False)
    print("File " + file_name + " saved.\n")

 _reset_data_dirs()

 # When done, also create xlsx versions of all csvs. Only *.csv files are
 # matched so that workbooks written by an earlier run are not read back in
 # as if they were CSV.
 csvs = glob.glob("FBgroup*.csv")

 for csvfile in csvs:
    workbook = xlsxwriter.Workbook(os.path.splitext(csvfile)[0] + ".xlsx")
    worksheet = workbook.add_worksheet("DATA")  # add and name a worksheet

    # newline="" is what the csv module asks for; utf-8 is the encoding
    # DataFrame.to_csv writes by default.
    with open(csvfile, "r", newline="", encoding="utf-8") as input_csv:
        for i, row in enumerate(csv.reader(input_csv)):
            worksheet.write_row(i, 0, row)
    workbook.close()
	#!/usr/local/bin/python3
	'''
	FACEBOOK GROUP DATA DOWNLOADER

	Put this file (fbg.py) in a directory alongside a file (groups.csv)
	with two comma separated columns [group,id].

	Add your personal FB credentials to this file (fbg.py, line 25-26).

	Run fbg.py with Python in Terminal:
	$ python fbg.py
	'''
	# Import and setup
	import os
	import re
	import glob
	import pandas as pd
	from fb_scrape_public import fb_scrape_public as fsp
	import shutil
	import time
	import xlsxwriter
	import csv

	# FB credentials
	client_id = ''
	client_secret = ''

	# Scratch directories holding the raw scraper output of the group that is
	# currently being processed. Both are wiped before and after every group.
	DATA_DIRS = ('posts_data', 'comments_data')

	# Regex substitutions applied, in this order, to the 'time' column.
	# Example: 2018-03-15T14:58:00+0000 becomes 180315_145800
	TIME_CLEANUP = (
	    ('-', ''),
	    ('T', '_'),
	    (':', ''),
	    (r'\+0000', ''),
	    ('^20', ''),
	)


	def _reset_data_dirs():
	    """Delete the scratch directories, skipping any that do not exist.

	    Each directory is removed independently, so a missing 'posts_data'
	    no longer prevents 'comments_data' from being cleaned up.
	    """
	    for data_dir in DATA_DIRS:
	        shutil.rmtree(data_dir, ignore_errors=True)


	# Read the file. DataFrame.from_csv was removed in pandas 1.0; read_csv is
	# its replacement. Everything is read as text so that ids and purely
	# numeric group names stay strings.
	groups = pd.read_csv("groups.csv", dtype=str)

	# Cleanup data folders left behind by an earlier run
	_reset_data_dirs()

	# Get everything ('/' is replaced because the group name ends up in the
	# name of the output file)
	grps = [g.replace('/', '_') for g in groups['group'].tolist()]

	for group, group_id in zip(grps, groups['id'].tolist()):

	    print("Processing " + group)

	    # Start every group from empty data folders
	    _reset_data_dirs()
	    os.makedirs('posts_data')

	    # Get the posts. The code expects scrape_fb to leave its output in
	    # fb_data.csv in the working directory; that file is moved into the
	    # posts folder and read back from there.
	    fsp.scrape_fb(client_id, client_secret, str(group_id))
	    posts_path = "posts_data/" + str(group_id) + "posts.csv"
	    os.rename("fb_data.csv", posts_path)
	    posts_df = pd.read_csv(posts_path)

	    # Remove linebreaks and tabs in the dataframe. The alternation bars
	    # must stay unescaped: with '\|' the pattern only matches a literal
	    # newline-bar-tab-bar-return sequence and nothing gets cleaned.
	    posts_df = posts_df.replace({r'\n|\t|\r': ' '}, regex=True)

	    # Add a copy of the message column for future sorting purposes
	    posts_df['post_sorting'] = posts_df['message']

	    # Get the post ids to be able to scrape the comments.
	    # NOTE(review): the ids are taken by position (17th column of the
	    # scraper output) - confirm this matches the fb_scrape_public layout.
	    post_ids = posts_df.iloc[:, 16].tolist()

	    os.makedirs('comments_data')

	    # Get the comments. A dedicated loop variable is used so that the
	    # group id of the outer loop is not overwritten.
	    for count, post_id in enumerate(post_ids, start=1):
	        print("Getting comments to post {}/{} ({})".format(
	            count, len(post_ids), post_id))

	        try:
	            fsp.scrape_fb(client_id, client_secret, post_id,
	                          scrape_mode="comments")
	            os.rename("fb_data.csv",
	                      "comments_data/" + str(post_id) + "_scraped.csv")
	        except Exception as e:
	            # Best effort: report the failing post, pause, then move on.
	            print("Could not get comments for post {}: {}".format(
	                post_id, e))
	            time.sleep(2)

	    # Make a list of dataframes, one per comments file, and add a column
	    # with the group id, i.e. the part of the file name that precedes the
	    # first underscore.
	    all_comments_files = glob.glob("comments_data/*.csv")
	    df_from_each_file = []
	    for count, f in enumerate(all_comments_files, start=1):
	        df = pd.read_csv(f)
	        df['group_id'] = os.path.basename(f).split('_')[0]
	        df_from_each_file.append(df)

	        print("Reading file {}/{}".format(count, len(all_comments_files)))

	    if not df_from_each_file:
	        print("Unable to get data from this group.")
	        continue

	    print("Joining files ...")
	    comments_df = pd.concat(df_from_each_file, ignore_index=True)
	    print("Done!")

	    # Rename the 'original_message' column for future sorting purposes
	    comments_df = comments_df.rename(
	        columns={'original_message': 'post_sorting'})

	    # Keep only some of the columns. reindex is used instead of .loc so
	    # that a column missing from the scraper output becomes an empty (NaN)
	    # column instead of raising KeyError on pandas >= 1.0.
	    comments_df = comments_df.reindex(
	        columns=['from', 'comment', 'created_time', 'group_id',
	                 'post_sorting'])

	    # Mark the rows as comments and align the column name with posts_df
	    comments_df['type'] = 'comment'
	    comments_df = comments_df.rename(columns={'comment': 'message'})

	    # Keep the matching columns of posts_df, in the order of comments_df
	    posts_df = posts_df.reindex(
	        columns=['from', 'message', 'created_time', 'group_id',
	                 'post_sorting', 'type'])

	    fb_groups = pd.concat([posts_df, comments_df], ignore_index=True)

	    # Reorder columns and rename 'created_time' to 'time'
	    fb_groups = fb_groups[['group_id', 'post_sorting', 'created_time',
	                           'type', 'from', 'message']]
	    fb_groups = fb_groups.rename(columns={'created_time': 'time'})

	    # Clean up 'time' column. regex=True is passed explicitly because the
	    # default of Series.str.replace became regex=False in pandas 2.0.
	    for pattern, replacement in TIME_CLEANUP:
	        fb_groups['time'] = fb_groups['time'].str.replace(
	            pattern, replacement, regex=True)

	    # Sort by group id, then time, then thread text
	    fb_groups = fb_groups.sort_values(
	        ['group_id', 'time', 'post_sorting'], ascending=True)

	    # Replace linebreaks and tabs in the dataframe (unescaped bars, see
	    # the note on the posts dataframe above in this same block)
	    fb_groups = fb_groups.replace({r'\n|\r|\t': ' '}, regex=True)

	    # Final order: by thread text, then by time within each thread
	    fb_groups = fb_groups.sort_values(['post_sorting', 'time'])

	    file_name = "FBgroup_" + group + ".csv"

	    # Save to file
	    fb_groups.to_csv(file_name, index=False)
	    print("File " + file_name + " saved.\n")

	_reset_data_dirs()

	# When done, also create xlsx versions of all csvs. Only *.csv files are
	# matched so that workbooks written by an earlier run are not read back in
	# as if they were CSV.
	csvs = glob.glob("FBgroup*.csv")

	for csvfile in csvs:
	    workbook = xlsxwriter.Workbook(os.path.splitext(csvfile)[0] + ".xlsx")
	    worksheet = workbook.add_worksheet("DATA")  # add and name a worksheet

	    # newline="" is what the csv module asks for; utf-8 is the encoding
	    # DataFrame.to_csv writes by default.
	    with open(csvfile, "r", newline="", encoding="utf-8") as input_csv:
	        for i, row in enumerate(csv.reader(input_csv)):
	            worksheet.write_row(i, 0, row)
	    workbook.close()