Last active
December 11, 2015 03:58
-
-
Save illy/4541557 to your computer and use it in GitHub Desktop.
script for crawling and preprocessing tweet data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Crawl tweets from the URLs listed in an external address file.
#
# Usage: crawl.sh [PARENT_DIR [EXTERNAL_ADDRESS_LIST]]
#   PARENT_DIR             directory under which the per-run download
#                          directory is created (default: PARENT_DIR)
#   EXTERNAL_ADDRESS_LIST  file with the URLs to fetch, one per line
#                          (default: EXTERNAL_ADDRESS_LIST)
# The defaults are the placeholder names; either pass real values as
# arguments or edit them in place.

set -u

parent_dir=${1:-PARENT_DIR}
address_list=${2:-EXTERNAL_ADDRESS_LIST}

# One download directory per run, named after the download date and time.
DIR="${parent_dir}/$(date '+%d-%m-%y-%H:%M')"

# Do not start the download if the target directory cannot be created.
mkdir -p -- "${DIR}" || exit 1

# wget options:
#   -i   read the URLs from an external address list
#   -np  never ascend to the parent directory
#   -r   retrieve recursively
#   -N   turn on time-stamping
#   -l1  limit the recursion depth to one level
#   -P   save everything under the given directory prefix
wget -i "${address_list}" -np -r -N -l1 -P "${DIR}"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Preprocess crawled tweet data.
#
# Run from the directory that holds the crawl directories, i.e. the
# directories whose names match the glob [MRT]*:* . For each of them:
#   1. create CLEAN11/<crawl dir name with its first "-NN:NN" part removed>;
#   2. pretty-print every file under <crawl dir>/search.twitter.com/ with
#      Python's json.tool and append one " DD-MM-YYYY HH:MM:SS  <text>"
#      line per tweet to a file in that CLEAN11 sub-directory;
#   3. move the crawl directory into RAW11.
#
# Requires: python (json.tool), awk, sed.

set -u
set -o pipefail

readonly RAW_DIR='RAW11'     # where the original crawl directories end up
readonly CLEAN_DIR='CLEAN11' # where the processed data is written

#######################################
# Turn pretty-printed search JSON into one line per tweet.
# Arguments: none
# Inputs:    stdin  - JSON as printed by `python -m json.tool`
# Outputs:   stdout - one " DD-MM-YYYY HH:MM:SS  <text>" line per tweet
#######################################
extract_tweets() {
  # Build one sed expression per month so that every "DD Mon YYYY" date is
  # rewritten to "DD-MM-YYYY" (Aug is month 08).
  local -a month_exprs=()
  local month expr
  local month_no=0
  for month in Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec; do
    month_no=$((month_no + 1))
    printf -v expr \
      's/^\\( [0-9]*\\) %s \\([0-9][0-9][0-9][0-9]\\) /\\1-%02d-\\2 /' \
      "${month}" "${month_no}"
    month_exprs+=(-e "${expr}")
  done

  # awk remembers the latest "created_at" line and, inside the
  # "source".."to_user" range, the current line; it prints both for every
  # input line. Only the combined lines that contain "text" are kept.
  # NOTE(review): this relies on the "text" key being printed between
  # "source" and "to_user" (e.g. alphabetically sorted keys) - confirm that
  # the json.tool in use emits the keys in that order.
  awk '
    /"created_at"/          { created_at = $0 }
    /"source"/, /"to_user"/ { text = $0 }
                            { print created_at, text }
  ' |
    sed -e '/"text"/!d' \
      -e 's/^[^,]*,//' \
      -e 's/[+][0-9]*", *"text": "/ /g' \
      -e 's/", *$//' \
      "${month_exprs[@]}"
  # sed steps, in order: keep only "text" lines; drop everything up to the
  # first comma (key name and weekday); replace the UTC offset and the
  # "text" key with a space; drop the closing quote and comma; normalise
  # the date.
}

#######################################
# Process one crawl directory and move it to RAW_DIR afterwards.
# Globals:   RAW_DIR, CLEAN_DIR (read)
# Arguments: $1 - crawl directory
# Outputs:   prints the output directory to stdout, warnings to stderr
# Returns:   0 on success, 1 if any step failed
#######################################
clean_crawl_dir() {
  local crawl_dir=$1
  # Output sub-directory: the crawl directory name without its "-NN:NN" part.
  local out_dir="${CLEAN_DIR}/${crawl_dir/-[0-9][0-9]:[0-9][0-9]/}"
  local raw_file out_name
  local status=0

  printf '%s\n' "${out_dir}"
  mkdir -p -- "${out_dir}" || return 1

  for raw_file in "${crawl_dir}"/search.twitter.com/*; do
    [[ -f "${raw_file}" ]] || continue # unmatched glob or not a regular file

    # Output file name: the part of the file name after the first '=' and
    # before the first '&'.
    out_name=${raw_file##*/}
    out_name=${out_name#*=}
    out_name=${out_name%%'&'*}

    if ! python -m json.tool < "${raw_file}" |
      extract_tweets >> "${out_dir}/${out_name}"; then
      printf 'warning: failed to process %s\n' "${raw_file}" >&2
      status=1
    fi
  done

  # Move the original data out of the way so it is not processed twice.
  mv -- "${crawl_dir}" "${RAW_DIR}/" || status=1
  return "${status}"
}

#######################################
# Entry point: process every crawl directory in the current directory.
# Returns: 0 on success, 1 if any directory could not be fully processed
#######################################
main() {
  local crawl_dir
  local status=0

  mkdir -p -- "${RAW_DIR}" "${CLEAN_DIR}" || return 1

  for crawl_dir in [MRT]*:*; do
    [[ -d "${crawl_dir}" ]] || continue # skip the literal, unmatched pattern
    clean_crawl_dir "${crawl_dir}" || status=1
  done
  return "${status}"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment