Created
June 13, 2016 02:29
-
-
Save ahgood/ff268a5a6e615fd43f5f8ac872a0159f to your computer and use it in GitHub Desktop.
Four independent scripts in one file; split them into separate files before use.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Script 1 of 4: normalize, sort and split a list of URLs.
 *
 * 1. Read "input.txt" and split it into lines.
 * 2. If a line contains "http" anywhere, keep only the text from the first
 *    "http" up to (not including) the next "http". A line holding two URLs
 *    is thereby reduced to the first one, and anything in front of the
 *    first "http" is discarded.
 * 3. Sort the resulting lines.
 * 4. Write lines that start with "http" to "_http_list.txt" and every other
 *    line to "_no_http_list.txt".
 *
 * `var` is used on purpose: the scripts in this file reuse the same
 * top-level names (fs, urls, ...), which only `var` allows to be redeclared.
 */
var _http_list = '';
var _no_http_list = '';
var fs = require("fs");
var urls = fs.readFileSync('input.txt').toString().split('\n');
var sorted_urls = [];

urls.forEach(function (line) {
  if (line.indexOf('http') !== -1) {
    // split('http')[1] is the text between the first and the second "http";
    // re-prefixing it with 'http' rebuilds the first URL on the line.
    line = 'http' + line.split('http')[1];
  }
  sorted_urls.push(line);
});

// No comparator: lines are ordered as plain strings.
sorted_urls.sort();

sorted_urls.forEach(function (line) {
  if (line.substring(0, 4) === 'http') {
    _http_list += line + '\n';
  } else {
    _no_http_list += line + '\n';
  }
});

fs.writeFile("_http_list.txt", _http_list, function (err) {
  if (err) return console.log(err);
  console.log("The _http_list was saved!");
});

// The non-http lines get their own file. Writing them to "_http_list.txt"
// would race with the write above and leave "_no_http_list.txt" missing.
fs.writeFile("_no_http_list.txt", _no_http_list, function (err) {
  if (err) return console.log(err);
  console.log("The _no_http_list was saved!");
});
/*
 * Script 2 of 4: drop a URL when the same URL with a trailing "/" is also
 * present in the input. For example, given
 *   http://www.ibm.com/en_US/
 *   http://www.ibm.com/en_US
 * only "http://www.ibm.com/en_US/" is kept.
 *
 * Reads "input.txt" and writes the surviving lines, in input order, to
 * "output.txt". Lines that are exact duplicates of each other are all kept.
 */
var result = '';
var fs = require("fs");
var urls = fs.readFileSync('input.txt').toString().split('\n');

// Index every input line once, so the membership test in the loop below is a
// single property lookup instead of a scan over the whole array per line.
// The table has no prototype, so a line such as "constructor" or "__proto__"
// cannot collide with an inherited property.
var known_lines = Object.create(null);
urls.forEach(function (line) {
  known_lines[line] = true;
});

urls.forEach(function (line) {
  // Keep the line unless its trailing-slash variant also occurs in the input.
  if (!((line + '/') in known_lines)) {
    result += line + '\n';
  }
});

fs.writeFile("output.txt", result, function (err) {
  if (err) return console.log(err);
  console.log("Done.");
});
/*
 * Script 3 of 4: separate the JP URLs from the AP+GCG URLs.
 *
 * Reads "input.txt" line by line. A line containing "/jp" or "/ja" anywhere
 * goes to "URLs_JP.txt"; every other line goes to "URLs_AP_GCG.txt".
 * Input order is preserved and each written line is followed by "\n".
 */
var fs = require("fs");
var urls = fs.readFileSync('input.txt').toString().split('\n');

// True when the line contains either of the two JP path markers.
var has_jp_marker = function (line) {
  return line.indexOf('/jp') >= 0 || line.indexOf('/ja') >= 0;
};

// Appends the line terminator used in both output files.
var with_newline = function (line) {
  return line + '\n';
};

var URLs_JP = urls
  .filter(has_jp_marker)
  .map(with_newline)
  .join('');

var URLs_AP_GCG = urls
  .filter(function (line) {
    return !has_jp_marker(line);
  })
  .map(with_newline)
  .join('');

fs.writeFile("URLs_JP.txt", URLs_JP, function (err) {
  if (err) {
    return console.log(err);
  }
  console.log("The URLs_JP was saved!");
});

fs.writeFile("URLs_AP_GCG.txt", URLs_AP_GCG, function (err) {
  if (err) {
    return console.log(err);
  }
  console.log("The URLs_AP_GCG was saved!");
});
/*
 * Script 4 of 4: split the lines of "input.csv" into two files.
 *
 * A line whose first four characters are "http" is appended to
 * "_http_list.txt"; every other line is appended to "_no_http_list.txt".
 * Input order is preserved and each written line is followed by "\n".
 */
var fs = require("fs");
var urls = fs.readFileSync("input.csv").toString().split('\n');
var _http_list = '';
var _no_http_list = '';

urls.forEach(function (item) {
  if (item.substring(0, 4) === 'http') {
    _http_list += item + '\n';
  } else {
    _no_http_list += item + '\n';
  }
});

// Node's fs.write() expects an open file descriptor as its first argument,
// not a path, so fs.write(path, data, 'w') cannot create these files.
// writeFileSync takes a path and, by default, creates or truncates the file,
// which is what the 'w' mode asked for. A failed write throws.
fs.writeFileSync('_http_list.txt', _http_list);
fs.writeFileSync('_no_http_list.txt', _no_http_list);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment