Skip to content

Instantly share code, notes, and snippets.

@ahgood
Created June 13, 2016 02:29
Show Gist options
  • Save ahgood/ff268a5a6e615fd43f5f8ac872a0159f to your computer and use it in GitHub Desktop.
Save ahgood/ff268a5a6e615fd43f5f8ac872a0159f to your computer and use it in GitHub Desktop.
Four scripts in one file; split them into separate files before use.
/*
Normalise, sort and split the lines of "input.txt".
1. If a line contains "http", keep only the first URL on it: text before the
   first "http" and everything from the second "http" onward is dropped.
2. Sort the resulting lines.
3. Lines that start with "http" are written to "_http_list.txt"; every other
   line is written to "_no_http_list.txt".
*/
var _http_list = '';
var _no_http_list = '';
var fs = require("fs");
var urls = fs.readFileSync('input.txt').toString().split('\n');
var sorted_urls = urls.map(function (line) {
  if (line.indexOf('http') === -1) {
    return line;
  }
  // split('http')[1] is the text between the first and second "http"
  // markers, so re-adding the prefix yields just the first URL on the line.
  return 'http' + line.split('http')[1];
});
// Default comparator: plain string ordering.
sorted_urls.sort();
sorted_urls.forEach(function (line) {
  if (line.substring(0, 4) === 'http') {
    _http_list += line + '\n';
  } else {
    _no_http_list += line + '\n';
  }
});
fs.writeFile("_http_list.txt", _http_list, function (err) {
  if (err) return console.log(err);
  console.log("The _http_list was saved!");
});
// Each list needs its own file: sending both asynchronous writes to the same
// path would let one clobber the other and leave "_no_http_list.txt" missing.
fs.writeFile("_no_http_list.txt", _no_http_list, function (err) {
  if (err) return console.log(err);
  console.log("The _no_http_list was saved!");
});
/*
Remove duplicated URLs that differ only by a trailing slash, e.g.
http://www.ibm.com/en_US/
http://www.ibm.com/en_US
When both forms are present in "input.txt", the line without the trailing
slash is dropped and the one with it is kept. Exact duplicate lines are not
removed. The surviving lines are written to "output.txt" in input order.
*/
var result = '';
var fs = require("fs");
var urls = fs.readFileSync('input.txt').toString().split('\n');
// Index every line once so each membership test below is a constant-time
// lookup rather than an indexOf scan of the whole list (quadratic overall).
// Object.create(null) keeps inherited property names from matching a line.
var present = Object.create(null);
urls.forEach(function (line) {
  present[line] = true;
});
urls.forEach(function (line) {
  // Keep the line unless its "with trailing slash" twin is also in the input.
  if (!present[line + '/']) {
    result += line + '\n';
  }
});
fs.writeFile("output.txt", result, function (err) {
  if (err) return console.log(err);
  console.log("Done.");
});
/*
Separate AP+GCG and JP URLs.
Every line of "input.txt" that contains "/jp" or "/ja" goes to "URLs_JP.txt";
all remaining lines go to "URLs_AP_GCG.txt". Input order is preserved and each
line is written back with a trailing newline.
*/
var fs = require("fs");
var urls = fs.readFileSync('input.txt').toString().split('\n');

// True when the line contains a "/jp" or "/ja" segment marker anywhere.
function isJapaneseUrl(url) {
  return url.indexOf('/jp') >= 0 || url.indexOf('/ja') >= 0;
}

// Re-attach the newline that split('\n') removed.
function withNewline(url) {
  return url + '\n';
}

// Build a writeFile callback that logs the error, or the given success message.
function reportSaved(successMessage) {
  return function (err) {
    if (err) {
      console.log(err);
      return;
    }
    console.log(successMessage);
  };
}

var URLs_JP = urls.filter(isJapaneseUrl).map(withNewline).join('');
var URLs_AP_GCG = urls.filter(function (url) {
  return !isJapaneseUrl(url);
}).map(withNewline).join('');

fs.writeFile("URLs_JP.txt", URLs_JP, reportSaved("The URLs_JP was saved!"));
fs.writeFile("URLs_AP_GCG.txt", URLs_AP_GCG, reportSaved("The URLs_AP_GCG was saved!"));
/*
Check each line of "input.csv": lines that start with the string "http" are
added to "_http_list.txt", all other lines are added to "_no_http_list.txt".
Both output files are overwritten on every run.
*/
var fs = require("fs");
var urls = fs.readFileSync("input.csv").toString().split('\n');
var _http_list = '';
var _no_http_list = '';
urls.forEach(function (item) {
  if (item.substring(0, 4) === 'http') {
    _http_list += item + '\n';
  } else {
    _no_http_list += item + '\n';
  }
});
// Node's fs.write() takes a numeric file descriptor, not a path, so a
// (path, content, 'w') call fails under Node. writeFileSync takes the path
// directly and replaces the file's contents, which is what 'w' asked for.
// NOTE(review): the three-argument form looks like PhantomJS's fs.write; this
// assumes Node is the intended runtime, since readFileSync is a Node API.
fs.writeFileSync('_http_list.txt', _http_list);
fs.writeFileSync('_no_http_list.txt', _no_http_list);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment