Skip to content

Instantly share code, notes, and snippets.

@DinisCruz
Last active December 29, 2015 08:18
Show Gist options
  • Select an option

  • Save DinisCruz/7642071 to your computer and use it in GitHub Desktop.

Select an option

Save DinisCruz/7642071 to your computer and use it in GitHub Desktop.
//using TeamMentor.CoreLib
//O2Ref:E:\TeamMentor\TM_Releases\Master_3_4\Web Applications\TM_Website\bin\TeamMentor.CoreLib.dll
// Root folder of the TeamMentor XML library to analyse.
// NOTE(review): machine-specific absolute path — adjust for the environment the script runs in.
var tmLibraryDir = @"E:\TeamMentor\TM_Releases\Master_3_4\Library_Data\XmlDatabase\TM_Libraries";
//var topPanel = O2Gui.open<Panel>("{name}",700,400);
// Clears `panel` (not declared in this script; presumably supplied by the script host) and adds a child panel to it.
var topPanel = panel.clear().add_Panel();
// Upper bound on how many xml files (and, further down, articles) are processed.
var itemsToProcess = 100000;
// *.xml files found under tmLibraryDir, capped at itemsToProcess.
// NOTE(review): the `true` argument presumably requests a recursive search — confirm against FluentSharp.
var xmlFiles = tmLibraryDir.files("*.xml",true).Take(itemsToProcess);
// Debug aid: uncomment to open the last library file in the code viewer.
//xmlFiles.last().showInCodeViewer();
// Builds the list of TeamMentor articles from xmlFiles. The builder lambda is handed to
// o2Cache under the "_articles" key (presumably so later runs can reuse the loaded list — confirm).
var articles = "_articles".o2Cache<List<TeamMentor_Article>>(
    ()=>{
            var start = DateTime.Now;
            var _articles = new List<TeamMentor_Article>();
            foreach(var file in xmlFiles)
            {
                // Only files whose name (without extension) is a GUID are treated as articles.
                if (!file.fileName_WithoutExtension().isGuid())
                {
                    "skipping file: {0}".info(file);
                    continue;
                }
                var article = file.load<TeamMentor_Article>();
                // Guard against files that could not be deserialized: a null entry here would
                // only surface later as a NullReferenceException when Content/Metadata are read.
                if (article == null)
                {
                    "could not load article from file: {0}".error(file);
                    continue;
                }
                _articles.add(article);
            }
            "Loaded in {0} ".debug(start.duration_To_Now());
            return _articles;
        });
// Returns how many distinct content blocks exist across all loaded articles,
// comparing the serialized form of each article's Content.
Func<int> getNumberOfUniqueContentBlocks =
    ()=>{
            "calculating getNumberOfUniqueContentBlocks".info();
            // A HashSet gives constant-time duplicate detection; the previous List-based
            // "add if not there" scan compared every new (large) string against all earlier ones.
            var uniqueContent = new HashSet<string>();
            foreach(var article in articles)
            {
                var serializedContent = article.Content.serialize(false);
                // A content block that did not serialize is not counted.
                if (serializedContent != null)
                {
                    uniqueContent.Add(serializedContent);
                }
            }
            return uniqueContent.Count;
        };
// Collects every non-empty text line found in the HTML content of the
// (at most itemsToProcess) loaded articles.
Func<List<string>> getAllLines =
    ()=>{
            var collectedLines = new List<string>();
            foreach(var currentArticle in articles.Take(itemsToProcess))
            {
                // Parse Content.Data_Json as HTML and keep only its inner text.
                var plainText = currentArticle.Content.Data_Json.htmlDocument().DocumentNode.InnerText;
                if (!plainText.valid())
                {
                    "In article :{0}, Content.Data_Json was empty".error(currentArticle.Metadata.toXml());
                    continue;
                }
                var textLines = plainText.split("\n").removeEmpty();
                collectedLines.add(textLines);
            }
            return collectedLines;
        };
// Returns the Title, Category, Type, Technology and Phase metadata values
// (in that order, article by article) of at most itemsToProcess loaded articles.
Func<List<string>> getAllMetadataValues =
    ()=>{
            return articles.Take(itemsToProcess)
                           .SelectMany(item => new[]
                                {
                                    item.Metadata.Title,
                                    item.Metadata.Category,
                                    item.Metadata.Type,
                                    item.Metadata.Technology,
                                    item.Metadata.Phase
                                })
                           .ToList();
        };
// Splits each non-null line on single spaces and returns all resulting
// non-empty words as one flat list (duplicates preserved).
Func<List<string>,List<string>> getWords =
    (lines)=>{
            return lines.toList()
                        .Where(entry => entry != null)
                        .SelectMany(entry => entry.split(" ").removeEmpty())
                        .toList();
        };
// Marks the end of the loading phase; the statistics are computed after this point.
"Loaded all data: {0} articles".debug(articles.size());
// Computes the line/word statistics and renders them as the script's text result.
var start2 = DateTime.Now;
var numberOfUniqueContentBlocks = getNumberOfUniqueContentBlocks();

// HTML content statistics
var allLines = getAllLines();
var allDistinctLines = allLines.Distinct().toList();
var numberOflines = allLines.size();
var numberOfUniqueLines = allDistinctLines.size();
var allWords = getWords(allLines);
var allWordsInUniqueLines = getWords(allDistinctLines);
var uniqueWords = allWords.Distinct().toList();

// Metadata statistics (the values are collected once and reused for the distinct set)
var allMetadataValues = getAllMetadataValues();
var allUniqueMetadataValues = allMetadataValues.Distinct().ToList();
var allMetadataWords = getWords(allUniqueMetadataValues);
var allUniqueMetadataWords = allMetadataWords.Distinct().ToList();

// The "#,##0" format keeps the thousands separator and still prints "0" for an empty count
// (a format made only of '#' placeholders renders zero as an empty string).
// NOTE(review): placeholder {0} is labelled "xml files" but receives articles.size(),
// i.e. the number of loaded articles — confirm that is the intended figure.
var result = @"
number of xml files: {0:#,##0}
METADATA:
number of values: {6:#,##0}
number of unique values: {7:#,##0}
number of words in unique values: {8:#,##0}
number of unique words: {9:#,##0}
HTML CONTENT:
number of unique ContentBlocks: {1:#,##0}
number of lines: {2:#,##0}
number of unique lines: {3:#,##0}
number of words: {4:#,##0}
number of words In UniqueLines: {10:#,##0}
number of unique words: {5:#,##0}
TOTAL:
number of words to translate: {11:#,##0}
mapped data in {12}
".format(articles.size(),
         numberOfUniqueContentBlocks,
         numberOflines,
         numberOfUniqueLines,
         allWords.size(),
         uniqueWords.size(),
         allMetadataValues.size(),
         allUniqueMetadataValues.size(),
         allMetadataWords.size(),
         allUniqueMetadataWords.size(),
         allWordsInUniqueLines.size(),
         // words to translate = words in the unique content lines + words in the unique metadata values
         allWordsInUniqueLines.size() + allMetadataWords.size(),
         start2.duration_To_Now());
return result.trim();
//using FluentSharp.For_HtmlAgilityPack
//O2Ref:O2_Misc_Microsoft_MPL_Libs.dll
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment