Last active
August 30, 2017 20:09
-
-
Save sebnyberg/428ee3e73dd71b45a0c1c27351f45014 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using DataReader.Extensions; | |
using iTextSharp.text.log; | |
using iTextSharp.text.pdf; | |
using iTextSharp.text.pdf.parser; | |
namespace DataReader.ExploredCode | |
{ | |
internal class OldProgram : IApplication | |
{ | |
/// <summary>
/// Logger handed to the constructor. None of the members visible in this class read it.
/// </summary>
private readonly ILogger _logger;

/// <summary>
/// Creates the program and stores the supplied logger.
/// </summary>
/// <param name="logger">Logger instance to keep; stored as-is, without validation.</param>
public OldProgram(ILogger logger) => _logger = logger;
/// <summary>
/// Reads the table pages of the formulary PDF, strips the repeated page text,
/// parses drug-name / tier pairs out of the remaining words and writes them as
/// a two-column CSV file next to the executable.
/// </summary>
/// <remarks>
/// All inputs are hard-coded constants that were tuned by hand for
/// <c>2017_TX_5T_EX.pdf</c>. After writing the file the method prints the number
/// of parsed drugs and waits for a key press.
/// </remarks>
public void Run()
{
    // MANUALLY ENTER VALUES FOR VARIABLES BELOW (current values fit 2017_TX_5T_EX.pdf)
    var basePath = AppDomain.CurrentDomain.BaseDirectory;
    const string pdfFilename = @"Docs\2017_TX_5T_EX.pdf";
    const string csvFilename = @"KaiserDrugTierList.csv";
    // First and last PDF page handed to the text extractor.
    const int firstPageTable = 15;
    const int lastPageTable = 159;
    // Check manually, looking for the last page number found in the extracted text.
    const int pageNumberOfLastPageOfTables = 145;

    string text;
    var pdfReader = new PdfReader(System.IO.Path.Combine(basePath, pdfFilename));
    try
    {
        text = ReadPdfAsText(pdfReader, firstPageTable, lastPageTable);
    }
    finally
    {
        // Release the underlying file even when extraction throws.
        pdfReader.Close();
    }

    text = StripPageFurniture(text, pageNumberOfLastPageOfTables);

    var words = text.Split(' ');
    var drugs = ExtractDrugs(words);

    var csv = new StringBuilder();
    foreach (var (name, tier) in drugs)
    {
        // Names are written exactly as collected, including their trailing space.
        csv.AppendLine($"\"{name}\",\"{tier}\"");
    }

    // Add any drugs from the 'tail' of the table that was cut off
    csv.AppendLine("\"ZORTRESS - everolimus tab 0.75 mg\",\"4\"");

    System.IO.File.WriteAllText(System.IO.Path.Combine(basePath, csvFilename), csv.ToString());
    Console.WriteLine($"Extracted information from {drugs.Count} drugs.");
    Console.ReadKey();
}

/// <summary>
/// Removes the text repeated on every page and the column headings, drops the
/// bullet characters and turns every newline into a stand-alone <c>$</c> word.
/// </summary>
/// <param name="text">Raw text extracted from the PDF.</param>
/// <param name="lastPrintedPageNumber">Highest printed page number to strip (numbers 1..N are tried).</param>
/// <returns>The cleaned text, with words separated by single spaces and line breaks marked by <c>$</c>.</returns>
private static string StripPageFurniture(string text, int lastPrintedPageNumber)
{
    // The repeated page text ends with the printed page number followed by '\n'.
    const string pageBoilerplate = "2017\n" +
                                   "Texas Residents -- Find and estimate prices for medicines on this formulary at:\n" +
                                   "https://www.myprime.com/v/BCBSTX/COMMERCIAL/TXMKTGNPLS/en/find-medicine.html\n" +
                                   "BCBSTX Health Insurance Marketplace 5 Tier Drug List July 2017 ";
    for (var page = 1; page <= lastPrintedPageNumber; page++)
    {
        text = text.Replace(pageBoilerplate + page + '\n', string.Empty);
    }

    const string columnHeadings = "Drug Name\n" +
                                  "Drug Tier\n" +
                                  "Prior Authorization\n" +
                                  "Step Therapy\n" +
                                  "Dispensing Limits\n" +
                                  "ACA\n" +
                                  "Limited Distribution\n";
    text = text.Replace(columnHeadings, string.Empty);

    // Remove all bullet dots
    text = text.Replace("•", string.Empty);

    // Remember where all new lines were
    return text.Replace("\n", " $ ");
}

/// <summary>
/// Walks the word list and collects one (name, tier) pair per tier word found.
/// </summary>
/// <param name="words">Space-separated words in which <c>$</c> marks a line break.</param>
/// <returns>The pairs in the order they were encountered.</returns>
private static List<(string name, string tier)> ExtractDrugs(string[] words)
{
    var drugs = new List<(string name, string tier)>();
    var nameBuilder = new StringBuilder();

    var index = 0;
    while (index < words.Length)
    {
        nameBuilder.Clear();

        // Skip line-break markers and all-caps words until the first word that
        // is neither. The bounds check must come first: the original tested
        // words[index] before it and threw once index ran off the end.
        while (index < words.Length && (words[index].Equals("$") || words[index].IsUpper()))
        {
            ++index;
        }

        if (index >= words.Length)
        {
            break;
        }

        var retrieveWordsUpToIndex = -1;

        // A lone "-" means the name started with words that were skipped above,
        // so rewind to pick them up again.
        if (words[index].Equals("-"))
        {
            retrieveWordsUpToIndex = index;
            index = Math.Max(index - 1, 0);

            // Step back over the preceding words that contain an alphabetic character.
            while (index > 0 && words[index].ContainsAlphabeticCharacter())
            {
                index--;
            }

            // Keep going back until the cursor sits at the start of a line
            // (the start of the text counts as one).
            while (index > 0 && !words[index - 1].Equals("$"))
            {
                index--;
            }
        }

        // Re-read everything from the rewound position through the "-".
        for (; index <= retrieveWordsUpToIndex; index++)
        {
            AppendWord(nameBuilder, words[index]);
        }

        // Everything up to the next tier word belongs to the name.
        // (A "$" is never a tier, so no separate check for it is needed here.)
        for (; index < words.Length && !IsTier(words, index); ++index)
        {
            AppendWord(nameBuilder, words[index]);
        }

        if (index >= words.Length)
        {
            break;
        }

        drugs.Add((nameBuilder.ToString(), words[index]));

        // index currently points at the tier of the drug just added; move past it.
        ++index;
    }

    return drugs;
}

/// <summary>Appends <paramref name="word"/> plus a space, ignoring the <c>$</c> line-break marker.</summary>
private static void AppendWord(StringBuilder builder, string word)
{
    if (word.Equals("$"))
    {
        return;
    }

    builder.Append(word);
    builder.Append(' ');
}
/// <summary>Words that are accepted as a tier value.</summary>
private static readonly HashSet<string> TierLabels =
    new HashSet<string>(StringComparer.Ordinal) { "1", "2", "3", "4", "5", "1,2", "A" };

/// <summary>
/// Tells whether the word at <paramref name="index"/> is a tier value standing
/// alone on its line.
/// </summary>
/// <param name="words">Word list in which <c>$</c> marks a line break.</param>
/// <param name="index">Position of the word to test.</param>
/// <returns>
/// <c>true</c> when the word is one of the known tier labels, the previous word
/// is <c>$</c>, and the next word is <c>$</c> or the word is the last one in the
/// list; otherwise <c>false</c>. Positions without a preceding word, or outside
/// the list, yield <c>false</c> instead of throwing.
/// </returns>
private static bool IsTier(IReadOnlyList<string> words, int index)
{
    // A tier needs a preceding line break, so index 0 can never qualify.
    if (index < 1 || index >= words.Count)
    {
        return false;
    }

    if (!words[index - 1].Equals("$"))
    {
        return false;
    }

    // Special case for the last entry in the pdf table: nothing follows it.
    var isLastWord = index == words.Count - 1;
    if (!isLastWord && !words[index + 1].Equals("$"))
    {
        return false;
    }

    return TierLabels.Contains(words[index]);
}
/// <summary>
/// Extracts the text of an inclusive page range and returns it as one string.
/// </summary>
/// <param name="pdfReader">Open reader for the document.</param>
/// <param name="firstPage">First page to extract.</param>
/// <param name="lastPage">Last page to extract (inclusive).</param>
/// <returns>The page texts appended one after another, with nothing inserted between pages.</returns>
private static string ReadPdfAsText(PdfReader pdfReader, int firstPage, int lastPage)
{
    var systemEncoding = Encoding.Default;
    var utf8 = Encoding.UTF8;
    var extracted = new StringBuilder();

    var page = firstPage;
    while (page <= lastPage)
    {
        // Each page gets its own extraction strategy instance.
        var rawText = PdfTextExtractor.GetTextFromPage(pdfReader, page, new SimpleTextExtractionStrategy());

        // NOTE(review): the text is pushed through the system default encoding
        // and back. Characters that encoding cannot represent presumably do not
        // survive this step - confirm whether the parser relies on that.
        var systemBytes = systemEncoding.GetBytes(rawText);
        var utf8Bytes = Encoding.Convert(systemEncoding, utf8, systemBytes);
        extracted.Append(utf8.GetString(utf8Bytes));

        page++;
    }

    return extracted.ToString();
}
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment