Last active
December 29, 2015 08:18
-
-
Save DinisCruz/7642071 to your computer and use it in GitHub Desktop.
See https://github.com/TeamMentor/UnitTests/blob/master/LibraryManagement/Calculate%20TM%20article%20totals.h2 for original
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//using TeamMentor.CoreLib
//O2Ref:E:\TeamMentor\TM_Releases\Master_3_4\Web Applications\TM_Website\bin\TeamMentor.CoreLib.dll

// Purpose: load every TeamMentor article found under tmLibraryDir and report
// totals (articles, content blocks, lines, words, metadata values) that are
// used to estimate how many words would need to be translated.
//
// NOTE(review): the "//using ..." and "//O2Ref:..." comment lines at the top and
// bottom of this script look like script-host directives (namespace import /
// assembly reference); they are kept byte-for-byte - confirm before editing them.

var tmLibraryDir   = @"E:\TeamMentor\TM_Releases\Master_3_4\Library_Data\XmlDatabase\TM_Libraries";
//var topPanel = O2Gui.open<Panel>("{name}",700,400);
var topPanel       = panel.clear().add_Panel();     // kept for its side effect on the host-provided panel
var itemsToProcess = 100000;                        // upper bound on files / articles processed
var xmlFiles       = tmLibraryDir.files("*.xml",true).Take(itemsToProcess);

// Articles are requested through o2Cache under the "_articles" key; the lambda
// below is the loader that builds the list from the xml files.
var articles = "_articles".o2Cache<List<TeamMentor_Article>>(
    ()=>{
            var start     = DateTime.Now;
            var _articles = new List<TeamMentor_Article>();
            foreach(var file in xmlFiles)
            {
                // only files whose name (without extension) is a GUID are treated as articles
                if (file.fileName_WithoutExtension().isGuid())
                {
                    var article = file.load<TeamMentor_Article>();
                    // guard: a null article would fail later on article.Content / article.Metadata
                    if (article != null)
                        _articles.add(article);
                    else
                        "failed to load article from file: {0}".error(file);
                }
                else
                    "skipping file: {0}".info(file);
            }
            "Loaded in {0} ".debug(start.duration_To_Now());
            return _articles;
        });

// Returns the number of distinct serialized article Content values.
Func<int> getNumberOfUniqueContentBlocks =
    ()=>{
            "calculating getNumberOfUniqueContentBlocks".info();
            // a HashSet gives constant-time membership checks instead of a linear
            // scan of a List for every article
            var uniqueContent = new HashSet<string>();
            foreach(var article in articles)
            {
                var serializedContent = article.Content.serialize(false);
                // a null serialization is not counted as a content block
                if (serializedContent != null)
                    uniqueContent.Add(serializedContent);
            }
            return uniqueContent.Count;
        };

// Returns every non-empty line of text extracted from the articles' html content.
Func<List<string>> getAllLines =
    ()=>{
            // strips the markup by taking the InnerText of the parsed html document
            Func<string,string> getTextFromHtml =
                (html)=>{
                            return html.htmlDocument().DocumentNode.InnerText;
                        };

            var _allLines = new List<string>();
            foreach(var article in articles.Take(itemsToProcess))
            {
                var text = getTextFromHtml(article.Content.Data_Json);
                if (text.valid())
                {
                    var lines = text.split("\n").removeEmpty();
                    _allLines.add(lines);
                }
                else
                    "In article :{0}, Content.Data_Json was empty".error(article.Metadata.toXml());
            }
            return _allLines;
        };

// Returns Title, Category, Type, Technology and Phase of every article
// (five entries per article; entries may be null, getWords skips those).
Func<List<string>> getAllMetadataValues =
    ()=>{
            var metadataValues = new List<string>();
            foreach(var article in articles.Take(itemsToProcess))
                metadataValues.AddRange(new[] { article.Metadata.Title,
                                                article.Metadata.Category,
                                                article.Metadata.Type,
                                                article.Metadata.Technology,
                                                article.Metadata.Phase });
            return metadataValues;
        };

// Splits every non-null line on single spaces and returns the non-empty words.
Func<List<string>,List<string>> getWords =
    (lines)=>{
                return (from line in lines
                        where line != null
                        from word in line.split(" ").removeEmpty()
                        select word).toList();
             };

"Loaded all data: {0} articles".debug(articles.size());

var start2 = DateTime.Now;

// HTML content totals
var numberOfUniqueContentBlocks = getNumberOfUniqueContentBlocks();
var allLines                    = getAllLines();
var allDistinctLines            = allLines.Distinct().toList();
var numberOflines               = allLines.size();
var numberOfUniqueLines         = allDistinctLines.size();
var allWords                    = getWords(allLines);
var allWordsInUniqueLines       = getWords(allDistinctLines);
var uniqueWords                 = allWords.Distinct().toList();

// Metadata totals (values are gathered once and reused for the distinct set)
var allMetadataValues           = getAllMetadataValues();
var allUniqueMetadataValues     = allMetadataValues.Distinct().ToList();
var allMetadataWords            = getWords(allUniqueMetadataValues).ToList();
var allUniqueMetadataWords      = allMetadataWords.Distinct().ToList();

// "#,##0" keeps the thousands separator and still prints "0" for a zero count
// (an all-'#' pattern renders zero as an empty string).
var result = @"
number of xml files: {0:#,##0}
METADATA:
number of values: {6:#,##0}
number of unique values: {7:#,##0}
number of words in unique values: {8:#,##0}
number of unique words: {9:#,##0}
HTML CONTENT:
number of unique ContentBlocks: {1:#,##0}
number of lines: {2:#,##0}
number of unique lines: {3:#,##0}
number of words: {4:#,##0}
number of words In UniqueLines: {10:#,##0}
number of unique words: {5:#,##0}
TOTAL:
number of words to translate: {11:#,##0}
mapped data in {12}
".format(articles.size(),
         numberOfUniqueContentBlocks,
         numberOflines,
         numberOfUniqueLines,
         allWords.size(),
         uniqueWords.size(),
         allMetadataValues.size(),
         allUniqueMetadataValues.size(),
         allMetadataWords.size(),
         allUniqueMetadataWords.size(),
         allWordsInUniqueLines.size(),
         // words to translate = words in unique content lines + words in unique metadata values
         allWordsInUniqueLines.size() + allMetadataWords.size(),
         start2.duration_To_Now());

return result.trim();

//using FluentSharp.For_HtmlAgilityPack
//O2Ref:O2_Misc_Microsoft_MPL_Libs.dll
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment