Created
May 30, 2012 16:20
-
-
Save josefslerka/2837367 to your computer and use it in GitHub Desktop.
Google Refine Template for Scraping Facebook Fan Pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[
  {
    "op": "core/column-addition-by-fetching-urls",
    "description": "Create column Facebook at index 2 by fetching URLs based on column url using expression grel:value",
    "engineConfig": {
      "facets": [],
      "mode": "row-based"
    },
    "newColumnName": "Facebook",
    "columnInsertIndex": 2,
    "baseColumnName": "url",
    "urlExpression": "grel:value",
    "onError": "set-to-blank",
    "delay": 5000
  },
  {
    "op": "core/column-split",
    "description": "Split column Facebook by separator",
    "engineConfig": {
      "facets": [],
      "mode": "record-based"
    },
    "columnName": "Facebook",
    "guessCellType": true,
    "removeOriginalColumn": false,
    "mode": "separator",
    "separator": "pagesListData",
    "regex": false,
    "maxColumns": 0
  },
  {
    "op": "core/column-split",
    "description": "Split column Facebook 2 by separator",
    "engineConfig": {
      "facets": [],
      "mode": "record-based"
    },
    "columnName": "Facebook 2",
    "guessCellType": true,
    "removeOriginalColumn": true,
    "mode": "separator",
    "separator": "uiCollapsedListVisible",
    "regex": false,
    "maxColumns": 0
  },
  {
    "op": "core/column-addition",
    "description": "Create column cistestranky at index 5 based on column Facebook 2 1 using expression grel:replace(value,/<\\/?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)\\/?>/,'')",
    "engineConfig": {
      "facets": [],
      "mode": "record-based"
    },
    "newColumnName": "cistestranky",
    "columnInsertIndex": 5,
    "baseColumnName": "Facebook 2 1",
    "expression": "grel:replace(value,/<\\/?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)\\/?>/,'')",
    "onError": "set-to-blank"
  },
  {
    "op": "core/column-split",
    "description": "Split column cistestranky by separator",
    "engineConfig": {
      "facets": [],
      "mode": "record-based"
    },
    "columnName": "cistestranky",
    "guessCellType": true,
    "removeOriginalColumn": false,
    "mode": "separator",
    "separator": "<a class=\"infoSeeMore",
    "regex": false,
    "maxColumns": 0
  },
  {
    "op": "core/column-split",
    "description": "Split column cistestranky 1 by separator",
    "engineConfig": {
      "facets": [],
      "mode": "record-based"
    },
    "columnName": "cistestranky 1",
    "guessCellType": true,
    "removeOriginalColumn": false,
    "mode": "separator",
    "separator": "\">",
    "regex": false,
    "maxColumns": 0
  },
  {
    "op": "core/column-removal",
    "description": "Remove column cistestranky 2",
    "columnName": "cistestranky 2"
  },
  {
    "op": "core/column-removal",
    "description": "Remove column Facebook 2 2",
    "columnName": "Facebook 2 2"
  },
  {
    "op": "core/column-removal",
    "description": "Remove column Facebook 2 3",
    "columnName": "Facebook 2 3"
  },
  {
    "op": "core/column-removal",
    "description": "Remove column cistestranky 1 1",
    "columnName": "cistestranky 1 1"
  },
  {
    "op": "core/column-removal",
    "description": "Remove column cistestranky 1",
    "columnName": "cistestranky 1"
  },
  {
    "op": "core/column-removal",
    "description": "Remove column cistestranky",
    "columnName": "cistestranky"
  },
  {
    "op": "core/column-removal",
    "description": "Remove column Facebook 2 1",
    "columnName": "Facebook 2 1"
  },
  {
    "op": "core/column-removal",
    "description": "Remove column Facebook 1",
    "columnName": "Facebook 1"
  },
  {
    "op": "core/column-rename",
    "description": "Rename column cistestranky 1 2 to fanpages",
    "oldColumnName": "cistestranky 1 2",
    "newColumnName": "fanpages"
  },
  {
    "op": "core/column-move",
    "description": "Move column Facebook to position 3",
    "columnName": "Facebook",
    "index": 3
  }
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment