Created
August 22, 2016 22:12
-
-
Save DavidVeksler/37ba1a3cec153d5dda3efb1d07a507e9 to your computer and use it in GitHub Desktop.
How FEE digitized and shared 70 years of archives on the Web: an Umbraco case study
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Diagnostics; | |
using System.Linq; | |
using System.Text; | |
using System.Web.Mvc; | |
using Archive.FEE.Web.Helper.PDFParser; | |
using Umbraco.Core; | |
using Umbraco.Core.Logging; | |
using Umbraco.Core.Models; | |
using Umbraco.Core.Services; | |
using Umbraco.Web; | |
using Umbraco.Web.Mvc; | |
namespace Archive.FEE.Web.App_Code.Controllers | |
{ | |
/// <summary>
/// Surface controller that mirrors a media folder tree into the content tree:
/// every media "Folder" becomes (or is matched to) a "category" content node and
/// every other media item is parsed as a PDF and turned into a "Document" node.
/// </summary>
public class ImportController : SurfaceController
{
    // Id of the content node that holds correspondence. Documents created directly
    // under it store their authors as sender/recipient instead of author fields.
    // NOTE(review): only direct children of this node are treated as correspondence;
    // nested categories fall through to the generic author fields - confirm intended.
    private const int CorrespondenceNodeId = 4822;

    private static readonly IContentService ContentService = ApplicationContext.Current.Services.ContentService;

    // Accumulates the human-readable import report returned by Index().
    private readonly StringBuilder output = new StringBuilder();

    // GET: Import
    // http://local.history.fee.org/umbraco/surface/import/index
    /// <summary>
    /// Runs the import for the media folder given by the "mediaid" query-string value
    /// into the content node given by "contentid" and returns the report as HTML.
    /// </summary>
    /// <returns>
    /// HTTP 400 when either id is missing or not an integer; otherwise the report
    /// wrapped in a &lt;pre&gt; element.
    /// </returns>
    public ActionResult Index()
    {
        int mediaNode;
        int contentNode;
        if (!int.TryParse(Request.QueryString["mediaid"], out mediaNode) ||
            !int.TryParse(Request.QueryString["contentid"], out contentNode))
        {
            return new HttpStatusCodeResult(400, "Query string must contain integer 'mediaid' and 'contentid' values.");
        }

        AddDocumentsForSpecifiedMediaFolder(mediaNode, contentNode);

        // The report contains file names, PDF titles and exception text, so it must be
        // encoded before being embedded in markup.
        return Content(string.Format("<pre>{0}</pre>", System.Net.WebUtility.HtmlEncode(output.ToString())));
    }

    /// <summary>
    /// Recursively imports the children of one media node into one content node.
    /// Folders are processed first (creating missing categories), then files.
    /// </summary>
    /// <param name="mediaNode">Id of the media node whose children are imported.</param>
    /// <param name="contentNode">Id of the content node that receives the new nodes.</param>
    private void AddDocumentsForSpecifiedMediaFolder(int mediaNode, int contentNode)
    {
        var mediaRoot = Umbraco.TypedMedia(mediaNode);
        if (mediaRoot == null)
        {
            output.AppendLine("media node not found: " + mediaNode);
            return;
        }

        var contentRoot = Umbraco.TypedContent(contentNode);
        if (contentRoot == null)
        {
            output.AppendLine("content node not found: " + contentNode);
            return;
        }

        // Materialise both child lists once. They are locals on purpose: this method
        // recurses, and a shared field would be overwritten by the nested call so that
        // later siblings were compared against the wrong node's children.
        var mediaFiles = mediaRoot.Children.ToList();
        output.AppendLine("media " + mediaFiles.Count + Environment.NewLine);

        var existingContent = contentRoot.Children.ToList();
        output.AppendLine("content " + existingContent.Count + Environment.NewLine);

        foreach (var folder in mediaFiles.Where(m => m.DocumentTypeAlias == "Folder"))
        {
            Debug.WriteLine(folder.Name);

            var existingCategory = existingContent.FirstOrDefault(d => d.Name == folder.Name);
            if (existingCategory != null)
            {
                output.AppendLine("Add media in " + folder.Name);
                AddDocumentsForSpecifiedMediaFolder(folder.Id, existingCategory.Id);
                continue;
            }

            // create new folder:
            var category = ContentService.CreateContent(folder.Name, contentNode, "category");
            if (ContentService.SaveAndPublishWithStatus(category))
            {
                Debug.WriteLine("Add media in " + folder.Name);
                output.AppendLine("Add media in " + folder.Name);
                AddDocumentsForSpecifiedMediaFolder(folder.Id, category.Id);
            }
            else
            {
                output.AppendLine("could not publish category: " + folder.Name);
            }
        }

        foreach (var file in mediaFiles.Where(m => m.DocumentTypeAlias != "Folder").Reverse())
        {
            Debug.WriteLine(file.Name);
            output.AppendLine(file.Name);
            output.AppendLine(CreateContentForMediaFile(contentNode, file, existingContent));
        }
    }

    /// <summary>
    /// Parses one media file as a PDF and creates a published "Document" node for it,
    /// unless a sibling with the same reference number already exists.
    /// </summary>
    /// <param name="parentNodeId">Content node that will own the new document.</param>
    /// <param name="file">Media item pointing at the PDF.</param>
    /// <param name="siblings">Content already present under <paramref name="parentNodeId"/>.</param>
    /// <returns>A one-line status message for the import report.</returns>
    private static string CreateContentForMediaFile(int parentNodeId, IPublishedContent file, IEnumerable<IPublishedContent> siblings)
    {
        IContent document = null;
        try
        {
            var meta = PDFParser.GetFileMetadata(file.Url);

            // check if this document was already added:
            var referenceNumber = meta.DocReferenceNumber.ToString();
            if (siblings.Any(d => d.GetPropertyValue<string>("docReferenceNumber") == referenceNumber))
            {
                Debug.WriteLine("document already exists");
                return "document already exists: " + file.Name;
            }

            document = SetDocumentProperties(parentNodeId, file, meta);
            if (!ContentService.SaveAndPublishWithStatus(document, 0, false))
            {
                LogHelper.Info<ImportController>("publish failed for " + meta.Title);
                return "saved but not published: " + meta.Title + Environment.NewLine;
            }

            Debug.WriteLine("saved" + meta.Title);
            LogHelper.Info<ImportController>("created " + meta.Title);
            return "saved" + meta.Title + Environment.NewLine;
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex);
            LogHelper.Error<ImportController>("error parsing PDF: ", ex);

            // Best-effort clean-up of a half-created node. Nothing to delete when the
            // failure happened before the document object existed.
            if (document != null)
            {
                try
                {
                    ContentService.Delete(document);
                }
                catch (Exception ex2)
                {
                    Debug.WriteLine(ex2);
                    LogHelper.Error<ImportController>("error removing partially imported document: ", ex2);
                }
            }

            return "no file added:" + ex;
        }
    }

    /// <summary>
    /// Creates an unsaved "Document" content item under <paramref name="parentNodeId"/>
    /// and copies the parsed PDF metadata onto its properties.
    /// </summary>
    /// <param name="parentNodeId">Content node that will own the document.</param>
    /// <param name="file">Media item the document links to via "mainDocument".</param>
    /// <param name="meta">Metadata extracted from the PDF.</param>
    /// <returns>The populated, not yet persisted content item.</returns>
    private static IContent SetDocumentProperties(int parentNodeId, IPublishedContent file, PDFFileMeta meta)
    {
        var document = ContentService.CreateContent(meta.Title, parentNodeId, "Document");

        // Fall back to the PDF creation date when no publication date was parsed.
        document.SetValue("Date", meta.DateOfPublication != DateTime.MinValue ? meta.DateOfPublication : meta.Created);

        var authors = meta.Authors;
        if (authors != null && authors.Length > 0)
        {
            if (parentNodeId == CorrespondenceNodeId)
            {
                // if correspondence: first author is the sender, second the recipient
                document.SetValue("correspondenceFrom", authors[0]);
                if (authors.Length > 1)
                {
                    document.SetValue("correspondenceTo", authors[1]);
                }
            }
            else
            {
                document.SetValue("author", authors[0]);
                if (authors.Length > 1)
                {
                    document.SetValue("author2", authors[1]);
                }
                if (authors.Length > 2)
                {
                    document.SetValue("author3", authors[2]);
                }
            }
        }

        document.SetValue("originalFile", meta.OriginalFileName);
        document.SetValue("docReferenceNumber", meta.DocReferenceNumber);
        document.SetValue("mainDocument", file.Id);
        document.SetValue("documentHTML", meta.DocumentHTML);
        document.SetValue("publisher", meta.Publisher);
        document.SetValue("Comments", meta.Comments);
        document.SetValue("categoryValue", meta.Category);
        document.SetValue("typeOfDocument", meta.CategorySelect());
        document.CreateDate = meta.Created;
        document.UpdateDate = meta.Modified;
        return document;
    }
}
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.IO; | |
using System.Linq; | |
using System.Net; | |
using iTextSharp.text.pdf; | |
using iTextSharp.text.pdf.parser; | |
using Umbraco.Core; | |
using Umbraco.Core.IO; | |
namespace Archive.FEE.Web.Helper.PDFParser | |
{ | |
/// <summary>
/// Names of the PDF document-information entries read during import.
/// </summary>
public class PDFProperties
{
    public const string Title = "Title";

    // Same key string as Category below.
    public const string TypeofDocument = "Category";

    public const string @Date = "Date of Publication";
    public const string Author = "Author";
    public const string OriginalFile = "Original File";
    public const string Publisher = "Publisher";
    public const string Comments = "Comments";

    // Was a mutable public static field; made const like its siblings so the key
    // cannot be reassigned at runtime.
    public const string Category = "Category";
}
/// <summary>
/// Plain container for the metadata extracted from one PDF file.
/// </summary>
public class PDFFileMeta
{
    public string[] Authors;
    public DateTime DateOfPublication;
    public string Subject;
    public string Title;

    public string Publisher { get; set; }
    public int DocReferenceNumber { get; set; }
    public string Comments { get; set; }
    public string DocumentHTML { get; set; }
    public DateTime Created { get; set; }
    public string OriginalFileName { get; set; }
    public DateTime Modified { get; set; }
    public string Category { get; set; }

    /// <summary>
    /// Maps <see cref="Category"/> to its numeric code.
    /// </summary>
    /// <returns>
    /// 0-4 for the known category names (both spellings of the Leonard Read journal
    /// map to 3); -1 for anything else, including null.
    /// </returns>
    public int CategorySelect()
    {
        var name = Category;

        if (name == "Correspondence")
        {
            return 0;
        }
        if (name == "FEE_Publication")
        {
            return 1;
        }
        if (name == "Personal Files")
        {
            return 2;
        }
        if (name == "Leonard Read Journal" || name == "Leonard E. Read Journal")
        {
            return 3;
        }
        if (name == "Non-FEE Publication")
        {
            return 4;
        }

        return -1;
    }
}
/// <summary>
/// Reads document-information entries and page text from PDF files stored in the
/// media file system.
/// </summary>
public static class PDFParser
{
    private const string BaseUrl = "http://history.fee.org";

    // Upper bound on the extracted text kept in PDFFileMeta.DocumentHTML.
    private const int MaxDocumentTextLength = 100000;

    private static readonly MediaFileSystem Media =
        FileSystemProviderManager.Current.GetFileSystemProvider<MediaFileSystem>();

    /// <summary>
    /// Builds a <see cref="PDFFileMeta"/> for the media file at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Media URL of the PDF; resolved to a path via the media file system.</param>
    /// <returns>The populated metadata; <c>Authors</c> is null when no author entry is present.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="url"/> is null/blank, or the PDF has no Title entry.
    /// </exception>
    /// <exception cref="FileNotFoundException">The resolved file does not exist on disk.</exception>
    public static PDFFileMeta GetFileMetadata(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            throw new ArgumentNullException(nameof(url), "file URL is missing");
        }

        var filePath = Media.GetFullPath(url);
        if (!File.Exists(filePath))
        {
            throw new FileNotFoundException("File does not exist:" + filePath, filePath);
            //var fileUrl = BaseUrl + url;
            //var directory = Path.GetDirectoryName(filePath);
            //Directory.CreateDirectory(directory);
            //new WebClient().DownloadFile(fileUrl, filePath);
        }

        ReadPDFInfo(filePath);

        var reader = new MetaDataReader(filePath);

        // Reject title-less files before paying for full text extraction.
        var title = reader.ReadEntry(PDFProperties.Title);
        if (string.IsNullOrWhiteSpace(title))
        {
            throw new ArgumentNullException("Title", "PDF has no Title entry: " + filePath);
        }

        var fileInfo = new FileInfo(filePath);
        var meta = new PDFFileMeta
        {
            Title = title,
            Authors = SplitAuthors(reader.ReadEntry(PDFProperties.Author)),
            Publisher = reader.ReadEntry(PDFProperties.Publisher),
            Comments = reader.ReadEntry(PDFProperties.Comments),
            Category = reader.ReadEntry(PDFProperties.Category),
            DocumentHTML = GetTextFromAllPages(filePath)?.Truncate(MaxDocumentTextLength),
            Created = reader.Created(),
            Modified = reader.Modified(),
            OriginalFileName = fileInfo.Name
        };

        // The reference number is the part of the file name before the first '-';
        // it stays 0 when that part is not an integer.
        int refNum;
        int.TryParse(fileInfo.Name.Split('-').FirstOrDefault(), out refNum);
        meta.DocReferenceNumber = refNum;

        // Parse with the invariant culture so the result does not depend on the
        // server's regional settings. DateOfPublication stays at its default on failure.
        DateTime dop;
        if (DateTime.TryParse(
            reader.ReadEntry(PDFProperties.Date),
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out dop))
        {
            meta.DateOfPublication = dop;
        }

        return meta;
    }

    /// <summary>
    /// Writes every document-information entry of the PDF to the debug output.
    /// </summary>
    /// <param name="path">Full path of the PDF file.</param>
    public static void ReadPDFInfo(string path)
    {
        var reader = new PdfReader(path);
        try
        {
            foreach (var b in reader.Info)
            {
                Debug.WriteLine(b.Key + ": " + b.Value);
            }
        }
        finally
        {
            // Release the underlying file even when enumeration fails.
            reader.Close();
        }
    }

    /// <summary>
    /// Extracts the text of every page, one page per line group.
    /// </summary>
    /// <param name="pdfPath">Full path of the PDF file.</param>
    /// <returns>The concatenated page text.</returns>
    public static string GetTextFromAllPages(string pdfPath)
    {
        var reader = new PdfReader(pdfPath);
        try
        {
            using (var output = new StringWriter())
            {
                // PDF page numbers are 1-based.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    output.WriteLine(PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
                }

                return output.ToString();
            }
        }
        finally
        {
            reader.Close();
        }
    }

    // Splits a comma-separated author entry into trimmed, non-empty names;
    // returns null when there is nothing usable so callers can keep a single null check.
    private static string[] SplitAuthors(string rawAuthors)
    {
        if (string.IsNullOrWhiteSpace(rawAuthors))
        {
            return null;
        }

        var authors = rawAuthors
            .Split(',')
            .Select(a => a.Trim())
            .Where(a => a.Length > 0)
            .ToArray();

        return authors.Length > 0 ? authors : null;
    }
}
/// <summary>
/// Reads the document-information entries of one PDF. The entries are copied when the
/// object is constructed and the file is closed again immediately, so no file handle is
/// held for the lifetime of the instance.
/// </summary>
internal class MetaDataReader
{
    private const string LocalDateFormat = "D:yyyyMMddHHmmss";
    private const string UtcDateFormat = "D:yyyyMMddHHmmssZ";

    // Snapshot of the PDF info entries taken in the constructor.
    private readonly System.Collections.Generic.IDictionary<string, string> _info;

    /// <param name="pdfPath">Full path of the PDF file to read.</param>
    public MetaDataReader(string pdfPath)
    {
        var reader = new PdfReader(pdfPath);
        try
        {
            _info = reader.Info;
        }
        finally
        {
            reader.Close();
        }
    }

    /// <summary>
    /// Returns the value stored under <paramref name="key"/>, also trying the key with a
    /// trailing space when the exact key is missing or blank.
    /// </summary>
    /// <returns>The entry value, or null when neither key is present.</returns>
    public string ReadEntry(string key)
    {
        string value;
        _info.TryGetValue(key, out value);
        if (string.IsNullOrWhiteSpace(value)) // bad data
        {
            _info.TryGetValue(key + " ", out value);
        }
        return value;
    }

    /// <summary>Parsed "CreationDate" entry; the current time when absent or unparseable.</summary>
    public DateTime Created()
    {
        string creationDate;
        _info.TryGetValue("CreationDate", out creationDate);
        return ParsePDFDate(creationDate);
    }

    /// <summary>Parsed "ModDate" entry; the current time when absent or unparseable.</summary>
    public DateTime Modified()
    {
        string modDate;
        _info.TryGetValue("ModDate", out modDate);
        return ParsePDFDate(modDate);
    }

    // Parses a PDF date string of the form D:yyyyMMddHHmmss, optionally followed by
    // 'Z' or a +/- offset. The offset itself is ignored (as before); previously a '+'
    // offset made every attempt fail and silently produced DateTime.Now.
    private static DateTime ParsePDFDate(string date)
    {
        if (string.IsNullOrWhiteSpace(date))
        {
            return DateTime.Now;
        }

        var provider = System.Globalization.CultureInfo.InvariantCulture;
        var styles = System.Globalization.DateTimeStyles.None;
        var text = date.Trim();
        DateTime parsed;

        // "…Z" (possibly followed by a zero offset): parse up to and including the Z.
        var zIndex = text.IndexOf('Z');
        if (zIndex >= 0 &&
            DateTime.TryParseExact(text.Substring(0, zIndex + 1), UtcDateFormat, provider, styles, out parsed))
        {
            return parsed;
        }

        // Otherwise drop any +hh'mm' / -hh'mm' suffix and parse the timestamp part.
        var offsetIndex = text.IndexOfAny(new[] { '+', '-' });
        var timestamp = offsetIndex >= 0 ? text.Substring(0, offsetIndex) : text;
        if (DateTime.TryParseExact(timestamp, LocalDateFormat, provider, styles, out parsed))
        {
            return parsed;
        }

        return DateTime.Now;
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment