Created
September 24, 2016 09:58
-
-
Save theraot/e743751f0b4c8817e4991d09fd8b794a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.IO; | |
using System.Net; | |
using Theraot.Core; | |
namespace AWDC | |
{ | |
internal class Program | |
{ | |
/// <summary>
/// Entry point: asks the user for the crawl settings on the console and then runs <see cref="Work"/>.
/// </summary>
/// <remarks>
/// Required answers are requested again until they are acceptable. If standard input ends
/// before every required answer was given (for example when input is redirected from a file
/// that is too short), the program reports it on standard error, sets exit code 1 and returns
/// instead of prompting forever.
/// </remarks>
private static void Main()
{
    Console.WriteLine("Welcome to Automatic Web Data Collector by Theraot");
    string pageUri;
    string uriPrefix;
    string file;
    int fields;
    string[] fieldPrefixes;
    string[] fieldPostfixes;
    try
    {
        do
        {
            pageUri = ReadRequiredLine("Enter the URI of the web page to explore: ");
        } while (pageUri.Length == 0);
        do
        {
            uriPrefix = ReadRequiredLine("Enter URI prefix to explore: ");
        } while (uriPrefix.Length == 0);
        do
        {
            file = ReadRequiredLine("Enter file to write to: ");
        } while (file.Length == 0);
        string answer;
        do
        {
            answer = ReadRequiredLine("How many data fields to read: ");
            // A negative count parses fine but would make the array allocations below throw.
        } while (!int.TryParse(answer, out fields) || fields < 0);
        fieldPrefixes = new string[fields];
        fieldPostfixes = new string[fields];
        for (var index = 0; index < fields; index++)
        {
            // An empty prefix is accepted; only end of input is rejected.
            fieldPrefixes[index] = ReadRequiredLine($"Enter data field prefix #{index + 1}: ");
            // The postfix is optional: a missing or blank answer falls back to a single space.
            Console.WriteLine($"Enter data field postfix #{index + 1}: ");
            fieldPostfixes[index] = Console.ReadLine();
            if (StringHelper.IsNullOrWhiteSpace(fieldPostfixes[index]))
            {
                fieldPostfixes[index] = " ";
            }
        }
    }
    catch (EndOfStreamException)
    {
        Console.Error.WriteLine("Input ended before all settings were provided.");
        Environment.ExitCode = 1;
        return;
    }
    Work(pageUri, uriPrefix, fields, fieldPrefixes, fieldPostfixes, file);
}

/// <summary>
/// Writes <paramref name="prompt"/> to the console and reads one line of input.
/// </summary>
/// <param name="prompt">The text shown before reading.</param>
/// <returns>The line that was read; may be empty but is never <c>null</c>.</returns>
/// <exception cref="EndOfStreamException">Standard input has no more lines.</exception>
private static string ReadRequiredLine(string prompt)
{
    Console.WriteLine(prompt);
    var line = Console.ReadLine();
    if (line == null)
    {
        // Console.ReadLine returns null once input is exhausted; retrying could never succeed.
        throw new EndOfStreamException();
    }
    return line;
}
/// <summary>
/// Downloads <paramref name="pageUri"/>, follows every <c>href</c> whose value starts with
/// <paramref name="uriPrefix"/>, and writes one tab-separated line per followed page to
/// <paramref name="file"/>.
/// </summary>
/// <param name="pageUri">Address of the page whose links are explored.</param>
/// <param name="uriPrefix">Only links starting with this text (ordinal, case-sensitive) are followed.</param>
/// <param name="fields">Number of data fields extracted from each followed page.</param>
/// <param name="fieldPrefixes">Text preceding each field, passed through to <see cref="ProcessPage"/>.</param>
/// <param name="fieldPostfixes">Text following each field, passed through to <see cref="ProcessPage"/>.</param>
/// <param name="file">Path of the output file, opened with <see cref="StreamWriter"/>.</param>
private static void Work(string pageUri, string uriPrefix, int fields, string[] fieldPrefixes, string[] fieldPostfixes, string file)
{
    using (var client = new WebClient())
    using (var fileWriter = new StreamWriter(file))
    {
        var pageString = client.DownloadString(pageUri);
        var mainProcessor = new StringProcessor(pageString);
        // The scan ends when ReadUntilAfter reports (with null) that no further "href" was found.
        while (mainProcessor.ReadUntilAfter("href") != null)
        {
            mainProcessor.ReadWhile(char.IsWhiteSpace);
            if (!mainProcessor.Read("="))
            {
                // "href" not followed by "=": not an attribute assignment, keep scanning.
                continue;
            }
            mainProcessor.ReadWhile(char.IsWhiteSpace);
            var href = ReadAttributeValue(mainProcessor);
            mainProcessor.ReadWhile(char.IsWhiteSpace);
            // URIs are identifiers, not linguistic text, so compare them ordinally.
            if (href == null || !href.StartsWith(uriPrefix, StringComparison.Ordinal))
            {
                continue;
            }
            var dataPage = client.DownloadString(href);
            ProcessPage(fields, dataPage, fieldPrefixes, fieldPostfixes, data => WriteRecord(fileWriter, data));
        }
    }
}

/// <summary>
/// Reads an attribute value at the processor's current position. Handles double-quoted,
/// single-quoted and unquoted values.
/// </summary>
/// <param name="processor">Processor positioned right after the <c>=</c> and any whitespace.</param>
/// <returns>Whatever <c>ReadWhile</c> returned for the value, without the surrounding quotes.</returns>
private static string ReadAttributeValue(StringProcessor processor)
{
    string value;
    if (processor.Read('"'))
    {
        value = processor.ReadWhile(c => c != '"');
        processor.Read('"');
        return value;
    }
    if (processor.Read('\''))
    {
        value = processor.ReadWhile(c => c != '\'');
        processor.Read('\'');
        return value;
    }
    // An unquoted value ends at whitespace or at the end of the tag.
    return processor.ReadWhile(c => !char.IsWhiteSpace(c) && c != '>');
}

/// <summary>
/// Writes the values of one page as a single line, each value followed by a tab, flushes the
/// writer so finished records are not held back in its buffer, and prints a progress dot.
/// </summary>
/// <param name="writer">Destination for the record.</param>
/// <param name="data">The field values of one page.</param>
private static void WriteRecord(TextWriter writer, string[] data)
{
    foreach (var value in data)
    {
        writer.Write(value);
        writer.Write('\t');
    }
    writer.WriteLine();
    writer.Flush();
    Console.Write(".");
}
/// <summary>
/// Extracts <paramref name="fields"/> values from <paramref name="dataPage"/> and passes them,
/// as one array, to <paramref name="add"/>.
/// </summary>
/// <remarks>
/// The fields are read in order from a single <c>StringProcessor</c> over the page. For each
/// one it reads until after the field's prefix, calls <see cref="SkipTags"/>, and then stores
/// what <c>ReadUntil</c> returns for the field's postfix.
/// </remarks>
/// <param name="fields">Number of values to extract.</param>
/// <param name="dataPage">Text of the page to read from.</param>
/// <param name="fieldPrefixes">Text preceding each value, indexed by field.</param>
/// <param name="fieldPostfixes">Text following each value, indexed by field.</param>
/// <param name="add">Callback invoked once with the extracted values.</param>
private static void ProcessPage(int fields, string dataPage, string[] fieldPrefixes, string[] fieldPostfixes, Action<string[]> add)
{
    var values = new string[fields];
    var reader = new StringProcessor(dataPage);
    var position = 0;
    while (position < fields)
    {
        reader.ReadUntilAfter(fieldPrefixes[position]);
        SkipTags(reader);
        var value = reader.ReadUntil(fieldPostfixes[position]);
        values[position] = value;
        position++;
    }
    add(values);
}
/// <summary>
/// Moves <paramref name="dataProcessor"/> past any run of <c>&lt;...&gt;</c> tags at its current
/// position, including the whitespace that follows each one.
/// </summary>
/// <param name="dataProcessor">The processor to advance.</param>
private static void SkipTags(StringProcessor dataProcessor)
{
    while (true)
    {
        var atTagStart = dataProcessor.Read("<");
        if (!atTagStart)
        {
            // No tag opens here, so there is nothing further to skip.
            return;
        }
        dataProcessor.ReadUntilAfter(">");
        dataProcessor.ReadWhile(char.IsWhiteSpace);
    }
}
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment