-
-
Save rvvvt/1fd9a309939408db511392aacbfe848b to your computer and use it in GitHub Desktop.
scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
using System.Net.Http; | |
using HtmlAgilityPack; | |
namespace scraper | |
{ | |
/// <summary>
/// Kind of resource discovered while crawling a page.
/// </summary>
enum AssetType
{
    /// <summary>A page reached through an anchor (<c>a[href]</c>); the only kind that is crawled further.</summary>
    Document = 0,

    /// <summary>An image referenced by <c>img[src]</c>.</summary>
    Image = 1,

    /// <summary>A resource referenced by <c>link[href]</c>.</summary>
    Link = 2,

    /// <summary>A script referenced by <c>script[src]</c>.</summary>
    Script = 3
}
/// <summary>
/// One resource found during a crawl, together with its processing state.
/// </summary>
internal class Asset
{
    /// <summary>URL of the page on which this asset was found; left unset when no parent was recorded.</summary>
    public string ParentUrl { get; set; }

    /// <summary>URL of the asset itself; the crawler uses it as the de-duplication key.</summary>
    public string Url { get; set; }

    /// <summary>What kind of resource this is.</summary>
    public AssetType Type { get; set; }

    /// <summary>True once the asset needs no further processing.</summary>
    public bool Done { get; set; }

    /// <summary>True when the asset was rejected as a crawl target and must be left out of the report.</summary>
    public bool Ignored { get; set; }
}
/// <summary>
/// Console entry point: crawls the site from its start page, writes the report, then waits for a key press.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        var scraper = new Scraper();

        // Seed the asset list from the start page.
        scraper.Scrape("http://192.168.30.1:8080/php/");

        // Each call processes one pending document, which may in turn queue more.
        while (HasPendingDocuments(scraper))
        {
            scraper.CheckForUnscrapedUrls();
        }

        scraper.PrintAsets();
        Console.ReadKey();
    }

    /// <summary>
    /// Tells whether at least one document asset is neither done nor ignored.
    /// </summary>
    private static bool HasPendingDocuments(Scraper scraper)
    {
        foreach (var asset in scraper.Assets)
        {
            if (!asset.Done && asset.Type == AssetType.Document && !asset.Ignored)
            {
                return true;
            }
        }

        return false;
    }
}
/// <summary>
/// Crawls the pages below a fixed base URL and records every document, image,
/// stylesheet/link and script reference it finds in <see cref="Assets"/>.
/// </summary>
class Scraper
{
    // Only URLs that start with this prefix are fetched; everything else is recorded as ignored.
    private readonly Uri baseUrl = new Uri("http://192.168.30.1:8080/php/");
    private readonly Uri baseAddress = new Uri("http://192.168.30.1:8080");
    private readonly HttpClient Client;
    private readonly Stopwatch Stopwatch;
    private int scrapes = 0;

    /// <summary>All assets discovered so far, in discovery order.</summary>
    public IList<Asset> Assets { get; set; }

    /// <summary>
    /// Creates a scraper with an empty asset list and a single reusable HTTP client.
    /// </summary>
    public Scraper()
    {
        Assets = new List<Asset>();
        Stopwatch = new Stopwatch();
        Client = new HttpClient()
        {
            BaseAddress = baseAddress
        };
    }

    /// <summary>
    /// Downloads <paramref name="target"/> and records every asset it references.
    /// </summary>
    /// <param name="target">
    /// URL of the page to scrape. Any <c>#fragment</c> is dropped first. A URL outside the
    /// base URL is not fetched; it is recorded (or re-flagged) as an ignored document.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="target"/> is null.</exception>
    public void Scrape(string target)
    {
        if (target == null)
        {
            throw new ArgumentNullException("target");
        }

        scrapes++;
        Stopwatch.Start(); // no-op when the stopwatch is already running

        target = StripFragment(target);

        if (!TargetIsValid(target))
        {
            MarkIgnored(target);
            return;
        }

        string contents;
        try
        {
            contents = GetTargetContents(target);
        }
        catch (HttpRequestException ex)
        {
            // One unreachable page must not abort the whole crawl.
            ReportFetchFailure(target, ex);
            return;
        }
        catch (TaskCanceledException ex)
        {
            // HttpClient reports a request timeout as a cancelled task.
            ReportFetchFailure(target, ex);
            return;
        }

        // Progress indicator: one line per 100 scrape calls.
        if (scrapes % 100 == 0)
        {
            Console.WriteLine(".");
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(contents);

        // Relative references are relative to the page that contains them, not to the site root.
        Uri pageUri;
        if (!Uri.TryCreate(target, UriKind.Absolute, out pageUri))
        {
            pageUri = baseUrl;
        }

        CollectAssets(doc, pageUri, target, "//a[@href]", "href", AssetType.Document);
        CollectAssets(doc, pageUri, target, "//img[@src]", "src", AssetType.Image);
        CollectAssets(doc, pageUri, target, "//link[@href]", "href", AssetType.Link);
        CollectAssets(doc, pageUri, target, "//script[@src]", "src", AssetType.Script);
    }

    /// <summary>
    /// Scrapes the first document asset that is neither done nor ignored, then marks it done.
    /// Does nothing when no such asset exists.
    /// </summary>
    public void CheckForUnscrapedUrls()
    {
        var todo = Assets.FirstOrDefault(x => !x.Done && x.Type == AssetType.Document && !x.Ignored);
        if (todo == null)
        {
            return;
        }

        try
        {
            Scrape(todo.Url);
        }
        finally
        {
            // Mark it done even on failure so a caller that keeps going never retries the same URL forever.
            todo.Done = true;
        }
    }

    /// <summary>
    /// Writes the report (each non-ignored document followed by the assets found on it)
    /// to <paramref name="outputPath"/> and prints the elapsed crawl time to the console.
    /// </summary>
    /// <param name="outputPath">File to write; defaults to the path this tool has always used.</param>
    internal void PrintAsets(string outputPath = @"C:\Users\SImon\Desktop\test.txt")
    {
        Stopwatch.Stop();

        var visible = Assets.Where(x => !x.Ignored).ToList();

        // Group once by parent instead of rescanning the whole list for every document.
        var childrenByParent = visible.ToLookup(x => x.ParentUrl);

        var lines = new List<string>();
        foreach (var doc in visible.Where(x => x.Type == AssetType.Document))
        {
            lines.Add(doc.Type + " > " + doc.Url);
            foreach (var child in childrenByParent[doc.Url])
            {
                lines.Add(" " + child.Type + " > " + child.Url);
            }
        }

        // WriteAllLines closes the file even when a write fails.
        System.IO.File.WriteAllLines(outputPath, lines);

        Console.WriteLine("-------------------");
        Console.WriteLine("Scrape took " + Stopwatch.ElapsedMilliseconds + "ms");
    }

    /// <summary>
    /// Records one asset per node matched by <paramref name="xpath"/>, taking its URL from
    /// <paramref name="attribute"/> and resolving it against <paramref name="pageUri"/>.
    /// </summary>
    private void CollectAssets(HtmlDocument doc, Uri pageUri, string parentUrl, string xpath, string attribute, AssetType type)
    {
        var nodes = doc.DocumentNode.SelectNodes(xpath);
        if (nodes == null)
        {
            // SelectNodes yields null rather than an empty collection when nothing matches.
            return;
        }

        // Only documents are crawled further; every other asset is complete as soon as it is recorded.
        var isDocument = type == AssetType.Document;

        foreach (HtmlNode node in nodes)
        {
            var reference = node.GetAttributeValue(attribute, null);
            if (reference == null)
            {
                continue;
            }

            Uri absolute;
            if (!Uri.TryCreate(pageUri, reference, out absolute))
            {
                // Malformed or over-long reference (e.g. a huge data: URI): skip it, keep crawling.
                continue;
            }

            // Fragments address a position inside a page, so they must not create separate crawl targets.
            var url = isDocument ? StripFragment(absolute.AbsoluteUri) : absolute.AbsoluteUri;

            AddToAssetsCollection(new Asset() { Url = url, Type = type, ParentUrl = parentUrl, Done = !isDocument });
        }
    }

    /// <summary>
    /// A target is crawlable only when it lies below the base URL. The prefix test also
    /// rejects fragment-only, <c>mailto:</c>, <c>data:</c> and off-site references.
    /// </summary>
    private bool TargetIsValid(string target)
    {
        if (string.IsNullOrEmpty(target))
        {
            return false;
        }

        return target.StartsWith(baseUrl.ToString(), StringComparison.Ordinal);
    }

    /// <summary>
    /// Flags the document asset with this URL as ignored and done, adding it when it is not known yet.
    /// </summary>
    private void MarkIgnored(string url)
    {
        var existing = Assets.FirstOrDefault(x => x.Url == url);
        if (existing == null)
        {
            Assets.Add(new Asset() { Url = url, Type = AssetType.Document, Done = true, Ignored = true });
        }
        else if (existing.Type == AssetType.Document)
        {
            existing.Done = true;
            existing.Ignored = true;
        }
    }

    /// <summary>Adds the asset unless one with the same URL is already recorded.</summary>
    private void AddToAssetsCollection(Asset asset)
    {
        if (!Assets.Any(x => x.Url == asset.Url))
        {
            Assets.Add(asset);
        }
    }

    /// <summary>
    /// Fetches <paramref name="target"/> and returns the response body as text.
    /// Blocks the calling thread because the public API of this class is synchronous.
    /// </summary>
    private string GetTargetContents(string target)
    {
        // GetAwaiter().GetResult() surfaces the original exception instead of an AggregateException.
        using (var response = Client.GetAsync(target).GetAwaiter().GetResult())
        {
            return response.Content.ReadAsStringAsync().GetAwaiter().GetResult();
        }
    }

    /// <summary>Logs a failed download to standard error.</summary>
    private static void ReportFetchFailure(string target, Exception error)
    {
        Console.Error.WriteLine("Failed to fetch " + target + ": " + error.Message);
    }

    /// <summary>Returns <paramref name="url"/> without its <c>#fragment</c> part, if it has one.</summary>
    private static string StripFragment(string url)
    {
        var hashIndex = url.IndexOf('#');
        return hashIndex < 0 ? url : url.Substring(0, hashIndex);
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment