Skip to content

Instantly share code, notes, and snippets.

@AlbertoMonteiro
Last active December 14, 2015 01:49
Show Gist options
  • Save AlbertoMonteiro/5009212 to your computer and use it in GitHub Desktop.
Save AlbertoMonteiro/5009212 to your computer and use it in GitHub Desktop.
Capturar todos os cnaes do site do governo.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;
namespace ConsoleApplication7
{
/// <summary>
/// Console crawler that walks the numeric-code links of the IBGE CNAE site,
/// level by level, and appends one "code;description" line per leaf page to
/// a text file.
/// </summary>
class Program
{
    /// <summary>Root that every relative href found on the site is appended to.</summary>
    private const string BaseUrl = "http://www.cnae.ibge.gov.br/";

    /// <summary>Path of the text file the results are written to.</summary>
    private const string OutputPath = "C:/cnaes.txt";

    /// <summary>
    /// Matches link text made up only of digits, '-', '/', '.' and spaces.
    /// Regex instances are safe to share between the parallel workers.
    /// </summary>
    private static readonly Regex CodigoRegex = new Regex(@"^([\d-/. ]+)$");

    /// <summary>Guards writes to <see cref="stream"/> from the parallel workers.</summary>
    private static readonly object streamLock = new object();

    /// <summary>Shared output writer; opened and disposed by <see cref="Main"/>.</summary>
    private static StreamWriter stream;

    /// <summary>
    /// Entry point: loads the structure page, recreates the output file and
    /// crawls every distinct code link found on that page in parallel.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var htmlDocument = RequestPage(BaseUrl + "estrutura.asp?TabelaBusca=CNAE_201@CNAE%202.1%20-%20Subclasses@0@cnaefiscal@0");
        var linksDasSessoes = ProcuraA(htmlDocument.DocumentNode);

        File.Delete(OutputPath);

        // Parallel.ForEach blocks until every nested worker has finished, so
        // disposing the writer when this scope ends cannot race with a write.
        using (stream = new StreamWriter(OutputPath, true))
        {
            Parallel.ForEach(
                linksDasSessoes.Select(x => x.GetAttributeValue("href", "")).Distinct(),
                s =>
                {
                    var document = RequestPage(BaseUrl + s);
                    BuscaLinksGrupos(ProcuraA(document.DocumentNode));
                });
        }
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and parses it as HTML.
    /// </summary>
    /// <param name="url">Absolute URL of the page to fetch.</param>
    /// <returns>
    /// The parsed document, or an empty <see cref="HtmlDocument"/> when the
    /// download or parse fails (best effort: one bad page must not stop the crawl).
    /// </returns>
    private static HtmlDocument RequestPage(string url)
    {
        var htmlDocument = new HtmlDocument();
        try
        {
            using (var webClient = new WebClient())
            using (var response = webClient.OpenRead(url))
            {
                // Dispose the response stream as well as the client; otherwise
                // every request leaks a connection until finalization.
                htmlDocument.Load(response);
            }
        }
        catch (Exception ex)
        {
            // Keep the original fallback, but leave a trace of what was skipped.
            Console.Error.WriteLine("Falha ao carregar {0}: {1}", url, ex.Message);
            return new HtmlDocument();
        }
        return htmlDocument;
    }

    /// <summary>
    /// Follows every link whose text is 6 characters long; on each page reached,
    /// follows every link whose text is 9 characters long and writes
    /// "link inner HTML;text of the last &lt;b&gt; element" to the output file.
    /// </summary>
    /// <param name="linksDasClasses">Code links collected from the previous level's page.</param>
    private static void BuscaLinksSubClasses(IEnumerable<HtmlNode> linksDasClasses)
    {
        Parallel.ForEach(
            HrefsComTamanho(linksDasClasses, 6),
            s =>
            {
                var document = RequestPage(BaseUrl + s);
                foreach (var linkDaSubclasse in ProcuraA(document.DocumentNode).Where(x => x.InnerText.Length == 9))
                {
                    var parteLink = linkDaSubclasse.GetAttributeValue("href", "");
                    var subDocument = RequestPage(BaseUrl + parteLink);

                    // A failed download yields a document without any <b>;
                    // skip that entry instead of throwing out of the worker.
                    var ultimoNegrito = ProcuraB(subDocument.DocumentNode).LastOrDefault();
                    if (ultimoNegrito == null)
                    {
                        Console.Error.WriteLine("Sem descricao: {0}", linkDaSubclasse.InnerHtml);
                        continue;
                    }

                    var descricao = ultimoNegrito.InnerText;
                    lock (streamLock)
                    {
                        stream.WriteLine("{0};{1}", linkDaSubclasse.InnerHtml, descricao);
                        stream.Flush();
                    }
                    Console.WriteLine("Salvo: {0}", linkDaSubclasse.InnerHtml);
                }
            });
    }

    /// <summary>
    /// Follows every link whose text is 3 characters long and hands the code
    /// links of each page reached to <see cref="BuscaLinksSubClasses"/>.
    /// </summary>
    /// <param name="linksDosGrupos">Code links collected from the previous level's page.</param>
    private static void BuscaLinksClasses(IEnumerable<HtmlNode> linksDosGrupos)
    {
        Parallel.ForEach(
            HrefsComTamanho(linksDosGrupos, 3),
            s =>
            {
                var document = RequestPage(BaseUrl + s);
                BuscaLinksSubClasses(ProcuraA(document.DocumentNode));
            });
    }

    /// <summary>
    /// Follows every link whose text is 2 characters long and hands the code
    /// links of each page reached to <see cref="BuscaLinksClasses"/>.
    /// </summary>
    /// <param name="linksDasDivisoes">Code links collected from the previous level's page.</param>
    private static void BuscaLinksGrupos(IEnumerable<HtmlNode> linksDasDivisoes)
    {
        Parallel.ForEach(
            HrefsComTamanho(linksDasDivisoes, 2),
            s =>
            {
                var document = RequestPage(BaseUrl + s);
                BuscaLinksClasses(ProcuraA(document.DocumentNode));
            });
    }

    /// <summary>
    /// Returns the distinct href values of the links whose text has exactly
    /// <paramref name="tamanho"/> characters.
    /// </summary>
    private static IEnumerable<string> HrefsComTamanho(IEnumerable<HtmlNode> links, int tamanho)
    {
        return links
            .Where(x => x.InnerText.Length == tamanho)
            .Select(x => x.GetAttributeValue("href", ""))
            .Distinct();
    }

    /// <summary>
    /// Finds the &lt;a&gt; elements whose text looks like a code (see <see cref="CodigoRegex"/>).
    /// </summary>
    /// <param name="documentNode">Root node of the page to search.</param>
    /// <returns>The matching anchors; empty when the page has no anchors at all.</returns>
    private static IEnumerable<HtmlNode> ProcuraA(HtmlNode documentNode)
    {
        // SelectNodes returns null, not an empty collection, when nothing matches.
        var anchors = documentNode.SelectNodes("//a");
        if (anchors == null)
        {
            return Enumerable.Empty<HtmlNode>();
        }
        return anchors.Where(x => CodigoRegex.IsMatch(x.InnerText));
    }

    /// <summary>
    /// Finds every &lt;b&gt; element of the page.
    /// </summary>
    /// <param name="documentNode">Root node of the page to search.</param>
    /// <returns>The &lt;b&gt; elements; empty when the page has none.</returns>
    private static IEnumerable<HtmlNode> ProcuraB(HtmlNode documentNode)
    {
        // SelectNodes returns null, not an empty collection, when nothing matches.
        var bolds = documentNode.SelectNodes("//b");
        if (bolds == null)
        {
            return Enumerable.Empty<HtmlNode>();
        }
        return bolds;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment