DavidVeksler · August 22, 2016 22:12
diff --git a/ImportController.cs b/ImportController.cs
 using System;
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.Linq;
 using System.Text;
 using System.Web.Mvc;
 using Archive.FEE.Web.Helper.PDFParser;
 using Umbraco.Core;
 using Umbraco.Core.Logging;
 using Umbraco.Core.Models;
 using Umbraco.Core.Services;
 using Umbraco.Web;
 using Umbraco.Web.Mvc;

 namespace Archive.FEE.Web.App_Code.Controllers
 {
    /// <summary>
    /// Surface controller that mirrors a media folder tree into the content tree: every media
    /// "Folder" is matched to (or created as) a "category" content node, and every other media
    /// item is imported as a "Document" content node filled from the PDF's metadata.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the action modifies content in response to a GET request and carries no
    /// authorization attribute in this file - confirm that access is restricted elsewhere.
    /// </remarks>
    public class ImportController : SurfaceController
    {
        /// <summary>
        /// Id of the content node whose direct children are imported as correspondence
        /// (first/second author stored as from/to instead of author/author2/author3).
        /// </summary>
        private const int CorrespondenceNodeId = 4822;

        private static readonly IContentService ContentService = ApplicationContext.Current.Services.ContentService;

        /// <summary>Published children of the content node currently being filled; used for duplicate detection.</summary>
        private IEnumerable<IPublishedContent> documents;

        /// <summary>Plain-text import log that is returned to the caller.</summary>
        private readonly StringBuilder output = new StringBuilder();

        // GET: Import
        // http://local.history.fee.org/umbraco/surface/import/index
        /// <summary>
        /// Runs the import for the media node given by the "mediaid" query string value into the
        /// content node given by "contentid", and returns the import log wrapped in a pre element.
        /// </summary>
        /// <returns>
        /// The HTML-encoded log, or an HTTP 400 result when either id is missing or not an integer.
        /// </returns>
        public ActionResult Index()
        {
            //var mediaNode = 1087;
            //var contentNode = 4822; // correspondence
            int mediaNode;
            int contentNode;
            if (!int.TryParse(Request.QueryString["mediaid"], out mediaNode) ||
                !int.TryParse(Request.QueryString["contentid"], out contentNode))
            {
                return new HttpStatusCodeResult(400, "mediaid and contentid must be integer node ids");
            }

            AddDocumentsForSpecifiedMediaFolder(mediaNode, contentNode);

            // The log contains file names and exception text, so it must be encoded before it is emitted as HTML.
            return Content(string.Format("<pre>{0}</pre>", Server.HtmlEncode(output.ToString())));
        }

        /// <summary>
        /// Recursively imports the children of one media node into one content node.
        /// Folders are processed first (depth-first), then the files of this level in reverse order.
        /// </summary>
        /// <param name="mediaNode">Id of the media node whose children are imported.</param>
        /// <param name="contentNode">Id of the content node that receives the new content.</param>
        private void AddDocumentsForSpecifiedMediaFolder(int mediaNode, int contentNode)
        {
            var mediaRoot = Umbraco.TypedMedia(mediaNode);
            var contentRoot = Umbraco.TypedContent(contentNode);
            if (mediaRoot == null || contentRoot == null)
            {
                // Previously this dereferenced null and aborted the whole import.
                output.AppendLine("skipped: media " + mediaNode + " or content " + contentNode + " was not found");
                return;
            }

            // Materialize once: both collections are enumerated several times below.
            var mediaFiles = mediaRoot.Children.ToList();
            output.AppendLine("media " + mediaFiles.Count + Environment.NewLine);

            var existingDocuments = contentRoot.Children.ToList();
            output.AppendLine("content " + existingDocuments.Count + Environment.NewLine);

            foreach (var folder in mediaFiles.Where(m => m.DocumentTypeAlias == "Folder"))
            {
                Debug.WriteLine(folder.Name);

                // Look up in the local list: the shared field is overwritten by every recursive call.
                var existingCategory = existingDocuments.FirstOrDefault(d => d.Name == folder.Name);
                if (existingCategory != null)
                {
                    output.AppendLine("Add media in " + folder.Name);
                    AddDocumentsForSpecifiedMediaFolder(folder.Id, existingCategory.Id);
                    continue;
                }

                // create new folder:
                var category = ContentService.CreateContent(folder.Name, contentNode, "category");
                if (ContentService.SaveAndPublishWithStatus(category))
                {
                    Debug.WriteLine("Add media in " + folder.Name);
                    output.AppendLine("Add media in " + folder.Name);
                    AddDocumentsForSpecifiedMediaFolder(folder.Id, category.Id);
                }
                else
                {
                    output.AppendLine("could not create category " + folder.Name);
                }
            }

            // The recursion above left the field pointing at a descendant's children;
            // restore this level's children so duplicate detection checks the right node.
            documents = existingDocuments;

            foreach (var file in mediaFiles.Where(m => m.DocumentTypeAlias != "Folder").Reverse())
            {
                Debug.WriteLine(file.Name);
                output.AppendLine(file.Name);
                output.AppendLine(CreateContentForMediaFile(contentNode, file));
            }
        }

        /// <summary>
        /// Imports one media file as a "Document" content node under <paramref name="parentNodeId"/>,
        /// unless a sibling with the same docReferenceNumber already exists.
        /// </summary>
        /// <param name="parentNodeId">Id of the content node that receives the document.</param>
        /// <param name="file">Media item whose Url points at the PDF.</param>
        /// <returns>A log line describing the outcome; failures are reported here and never thrown.</returns>
        private string CreateContentForMediaFile(int parentNodeId, IPublishedContent file)
        {
            IContent document = null;
            try
            {
                var pdfFile = file.Url;

                var meta = PDFParser.GetFileMetadata(pdfFile);

                // check if this document was already added:
                var referenceNumber = meta.DocReferenceNumber.ToString();
                if (documents.Any(d => d.GetPropertyValue<string>("docReferenceNumber") == referenceNumber))
                {
                    Debug.WriteLine("document already exists");
                    return "document already exists: " + file.Name;
                }

                document = SetDocumentProperties(parentNodeId, file, meta);

                var publishAttempt = ContentService.SaveAndPublishWithStatus(document, 0, false);
                if (!publishAttempt.Success)
                {
                    // The result used to be ignored, so a failed publish was reported as "saved".
                    LogHelper.Warn<ImportController>("not published: " + meta.Title);
                    return "not published: " + meta.Title + Environment.NewLine;
                }

                Debug.WriteLine("saved" + meta.Title);
                LogHelper.Info<ImportController>("created " + meta.Title);
                return "saved" + meta.Title + Environment.NewLine;
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex);
                LogHelper.Error<ImportController>("error parsing PDF: ", ex);

                // Only roll back a document that actually reached the database; deleting a null
                // or never-saved document just produced a second, silently swallowed exception.
                if (document != null && document.HasIdentity)
                {
                    try
                    {
                        ContentService.Delete(document);
                    }
                    catch (Exception ex2)
                    {
                        Debug.WriteLine(ex2);
                        LogHelper.Error<ImportController>("error removing partially imported document: ", ex2);
                    }
                }

                return "no file added:" + ex;
            }
        }

        /// <summary>
        /// Creates an unsaved "Document" content item under <paramref name="parentNodeId"/> and copies
        /// the PDF metadata onto its properties.
        /// </summary>
        /// <param name="parentNodeId">Id of the parent content node.</param>
        /// <param name="file">Media item stored in the "mainDocument" property (by id).</param>
        /// <param name="meta">Metadata read from the PDF.</param>
        /// <returns>The populated, not yet saved, content item.</returns>
        private static IContent SetDocumentProperties(int parentNodeId, IPublishedContent file, PDFFileMeta meta)
        {
            var document = ContentService.CreateContent(meta.Title, parentNodeId, "Document");

            // Fall back to the PDF creation date when no publication date could be parsed.
            document.SetValue("Date", meta.DateOfPublication != DateTime.MinValue ? meta.DateOfPublication : meta.Created);

            var authors = meta.Authors;
            if (authors != null && authors.Length > 0)
            {
                // if correspondence:
                if (parentNodeId == CorrespondenceNodeId)
                {
                    document.SetValue("correspondenceFrom", authors[0]);
                    if (authors.Length > 1) document.SetValue("correspondenceTo", authors[1]);
                }
                else
                {
                    document.SetValue("author", authors[0]);
                    if (authors.Length > 1) document.SetValue("author2", authors[1]);
                    if (authors.Length > 2) document.SetValue("author3", authors[2]);
                }
            }

            document.SetValue("originalFile", meta.OriginalFileName);
            document.SetValue("docReferenceNumber", meta.DocReferenceNumber);

            document.SetValue("mainDocument", file.Id);

            document.SetValue("documentHTML", meta.DocumentHTML);

            document.SetValue("publisher", meta.Publisher);

            document.SetValue("Comments", meta.Comments);
            document.SetValue("categoryValue", meta.Category);
            document.SetValue("typeOfDocument", meta.CategorySelect());

            document.CreateDate = meta.Created;
            document.UpdateDate = meta.Modified;
            return document;
        }
    }
 }
diff --git a/PDFParser.cs b/PDFParser.cs
 using System;
 using System.Diagnostics;
 using System.IO;
 using System.Linq;
 using System.Net;
 using iTextSharp.text.pdf;
 using iTextSharp.text.pdf.parser;
 using Umbraco.Core;
 using Umbraco.Core.IO;

 namespace Archive.FEE.Web.Helper.PDFParser
 {
    /// <summary>
    /// Keys of the PDF document-information entries that the importer reads.
    /// </summary>
    public class PDFProperties
    {
        public const string Title = "Title";
        public const string TypeofDocument = "Category";
        public const string @Date = "Date of Publication";
        public const string Author = "Author";
        public const string OriginalFile = "Original File";
        public const string Publisher = "Publisher";
        public const string Comments = "Comments";

        // Was a mutable public static field; made a constant like the other keys so it
        // cannot be reassigned at runtime.
        public const string Category = "Category";
    }

    /// <summary>
    /// Metadata extracted from one archived PDF file.
    /// </summary>
    public class PDFFileMeta
    {
        /// <summary>Author names taken from the PDF "Author" entry (comma separated in the file).</summary>
        public string[] Authors;

        /// <summary>Publication date; stays at default(DateTime) when the PDF entry could not be parsed.</summary>
        public DateTime DateOfPublication;

        /// <summary>Subject of the document.</summary>
        public string Subject;

        /// <summary>Title of the document.</summary>
        public string Title;

        public string Publisher { get; set; }

        /// <summary>Reference number of the document within the archive.</summary>
        public int DocReferenceNumber { get; set; }

        public string Comments { get; set; }

        /// <summary>Text content of the document.</summary>
        public string DocumentHTML { get; set; }

        public DateTime Created { get; set; }
        public string OriginalFileName { get; set; }
        public DateTime Modified { get; set; }

        /// <summary>Category name as written in the PDF metadata.</summary>
        public string Category { get; set; }

        /// <summary>
        /// Maps <see cref="Category"/> to its numeric document-type code.
        /// </summary>
        /// <returns>
        /// 0 Correspondence, 1 FEE_Publication, 2 Personal Files, 3 Leonard (E.) Read Journal,
        /// 4 Non-FEE Publication; -1 for any other value, including null.
        /// </returns>
        public int CategorySelect()
        {
            // Each case returns directly; the former "break" statements after "return" were unreachable.
            switch (Category)
            {
                case "Correspondence":
                    return 0;
                case "FEE_Publication":
                    return 1;
                case "Personal Files":
                    return 2;
                case "Leonard Read Journal":
                case "Leonard E. Read Journal":
                    return 3;
                case "Non-FEE Publication":
                    return 4;
                default:
                    return -1;
            }
        }
    }



    /// <summary>
    /// Reads metadata and page text out of PDF files stored in the media file system.
    /// </summary>
    public static class PDFParser
    {
        private static readonly MediaFileSystem Media =
            FileSystemProviderManager.Current.GetFileSystemProvider<MediaFileSystem>();

        /// <summary>
        /// Builds a <see cref="PDFFileMeta"/> for the PDF behind a media URL.
        /// </summary>
        /// <param name="url">Media URL/path of the PDF; resolved through the media file system.</param>
        /// <returns>The populated metadata; DocumentHTML holds at most 100000 characters of page text.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="url"/> is null/blank, or the PDF has no Title entry.
        /// </exception>
        /// <exception cref="FileNotFoundException">The resolved file does not exist on disk.</exception>
        public static PDFFileMeta GetFileMetadata(string url)
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                // The single-argument constructor takes a parameter name, not a message.
                throw new ArgumentNullException(nameof(url), "file URL is missing");
            }

            var filePath = Media.GetFullPath(url);

            if (!File.Exists(filePath))
            {
                throw new FileNotFoundException("File does not exist:" + filePath, filePath);
            }

            ReadPDFInfo(filePath);

            var reader = new MetaDataReader(filePath);

            // Validate the title before the expensive full-text extraction below.
            var title = reader.ReadEntry(PDFProperties.Title);
            if (string.IsNullOrWhiteSpace(title))
            {
                throw new ArgumentNullException("Title", "PDF has no Title entry: " + filePath);
            }

            var fileInfo = new FileInfo(filePath);

            // Trim each name so "A, B" does not yield " B"; the element count is unchanged.
            var authors = reader.ReadEntry(PDFProperties.Author)?.Split(',').Select(a => a.Trim()).ToArray();

            var meta = new PDFFileMeta
            {
                Title = title,
                Authors = authors,
                Publisher = reader.ReadEntry(PDFProperties.Publisher),
                Comments = reader.ReadEntry(PDFProperties.Comments),
                Category = reader.ReadEntry(PDFProperties.Category),
                DocumentHTML = GetTextFromAllPages(filePath)?.Truncate(100000),
                Created = reader.Created(),
                Modified = reader.Modified(),
                OriginalFileName = fileInfo.Name
            };

            // The reference number is the part of the file name before the first '-'; 0 when it is not numeric.
            int refNum;
            int.TryParse(fileInfo.Name.Split('-').FirstOrDefault(), out refNum);
            meta.DocReferenceNumber = refNum;

            DateTime dop;
            if (DateTime.TryParse(reader.ReadEntry(PDFProperties.Date), out dop))
            {
                meta.DateOfPublication = dop;
            }

            return meta;
        }

        /// <summary>
        /// Writes every document-information entry of the PDF to the debug output.
        /// </summary>
        /// <param name="path">Full path of the PDF file.</param>
        public static void ReadPDFInfo(string path)
        {
            var reader = new PdfReader(path);
            try
            {
                foreach (var entry in reader.Info)
                {
                    Debug.WriteLine(entry.Key + ": " + entry.Value);
                }
            }
            finally
            {
                // Release the file; the reader was previously left open.
                reader.Close();
            }
        }

        /// <summary>
        /// Extracts the text of every page, one page per line block, in page order.
        /// </summary>
        /// <param name="pdfPath">Full path of the PDF file.</param>
        /// <returns>The concatenated page text.</returns>
        public static string GetTextFromAllPages(string pdfPath)
        {
            var reader = new PdfReader(pdfPath);
            try
            {
                using (var output = new StringWriter())
                {
                    // PDF page numbers are 1-based.
                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        output.WriteLine(PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
                    }

                    return output.ToString();
                }
            }
            finally
            {
                // Release the file; the reader was previously left open.
                reader.Close();
            }
        }
    }



    /// <summary>
    /// Gives keyed access to the document-information entries of one PDF file.
    /// The entries are copied when the object is constructed and the file is closed immediately.
    /// </summary>
    internal class MetaDataReader
    {
        // Accepted shapes of the digit part of a PDF date ("D:" prefix and time-zone suffix removed).
        private static readonly string[] PdfDateFormats =
        {
            "yyyyMMddHHmmss", "yyyyMMddHHmm", "yyyyMMddHH", "yyyyMMdd", "yyyyMM", "yyyy"
        };

        private readonly System.Collections.Generic.IDictionary<string, string> _info;

        /// <summary>Reads the information entries of the PDF at <paramref name="pdfPath"/>.</summary>
        /// <param name="pdfPath">Full path of the PDF file.</param>
        public MetaDataReader(string pdfPath)
        {
            var reader = new PdfReader(pdfPath);
            try
            {
                _info = reader.Info;
            }
            finally
            {
                // The reader used to be kept in a field and never closed.
                reader.Close();
            }
        }

        /// <summary>
        /// Returns the value stored under <paramref name="key"/>; when that is missing or blank,
        /// retries with a trailing space appended to the key.
        /// </summary>
        /// <param name="key">Name of the information entry.</param>
        /// <returns>The entry value, or null when neither key is present.</returns>
        public string ReadEntry(string key)
        {
            string value;
            _info.TryGetValue(key, out value);
            if (string.IsNullOrWhiteSpace(value)) // bad data
            {
                _info.TryGetValue(key + " ", out value);
            }
            return value;
        }

        /// <summary>Parsed "CreationDate" entry; the current time when missing or unparseable.</summary>
        public DateTime Created()
        {
            string creationDate;
            _info.TryGetValue("CreationDate", out creationDate);
            return ParsePDFDate(creationDate);
        }

        /// <summary>Parsed "ModDate" entry; the current time when missing or unparseable.</summary>
        public DateTime Modified()
        {
            string modDate;
            _info.TryGetValue("ModDate", out modDate);
            return ParsePDFDate(modDate);
        }

        /// <summary>
        /// Parses a PDF date string such as "D:20160822221200Z" or "D:20160822221200-04'00'".
        /// Only the leading date/time digits are used; any time-zone suffix ("Z", "+HH'mm'",
        /// "-HH'mm'") is ignored, so the value is the local time as written in the file.
        /// </summary>
        /// <param name="date">Raw entry value; may be null.</param>
        /// <returns>The parsed value, or <see cref="DateTime.Now"/> when the text is blank or malformed.</returns>
        private static DateTime ParsePDFDate(string date)
        {
            if (string.IsNullOrWhiteSpace(date))
            {
                return DateTime.Now;
            }

            var text = date.Trim();
            if (text.StartsWith("D:", StringComparison.Ordinal))
            {
                text = text.Substring(2);
            }

            // Keep the run of leading digits (at most yyyyMMddHHmmss); this drops any offset suffix,
            // including "+HH'mm'" which previously made the whole value fall back to the current time.
            var digits = new string(text.TakeWhile(c => c >= '0' && c <= '9').Take(14).ToArray());

            DateTime parsed;
            var ok = DateTime.TryParseExact(
                digits,
                PdfDateFormats,
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None,
                out parsed);

            return ok ? parsed : DateTime.Now;
        }
    }
 }
	using System;
	using System.Collections.Generic;
	using System.Diagnostics;
	using System.Linq;
	using System.Text;
	using System.Web.Mvc;
	using Archive.FEE.Web.Helper.PDFParser;
	using Umbraco.Core;
	using Umbraco.Core.Logging;
	using Umbraco.Core.Models;
	using Umbraco.Core.Services;
	using Umbraco.Web;
	using Umbraco.Web.Mvc;

	namespace Archive.FEE.Web.App_Code.Controllers
	{
	/// <summary>
	/// Surface controller that mirrors a media folder tree into the content tree: every media
	/// "Folder" is matched to (or created as) a "category" content node, and every other media
	/// item is imported as a "Document" content node filled from the PDF's metadata.
	/// </summary>
	/// <remarks>
	/// NOTE(review): the action modifies content in response to a GET request and carries no
	/// authorization attribute in this file - confirm that access is restricted elsewhere.
	/// </remarks>
	public class ImportController : SurfaceController
	{
	    /// <summary>
	    /// Id of the content node whose direct children are imported as correspondence
	    /// (first/second author stored as from/to instead of author/author2/author3).
	    /// </summary>
	    private const int CorrespondenceNodeId = 4822;

	    private static readonly IContentService ContentService = ApplicationContext.Current.Services.ContentService;

	    /// <summary>Published children of the content node currently being filled; used for duplicate detection.</summary>
	    private IEnumerable<IPublishedContent> documents;

	    /// <summary>Plain-text import log that is returned to the caller.</summary>
	    private readonly StringBuilder output = new StringBuilder();

	    // GET: Import
	    // http://local.history.fee.org/umbraco/surface/import/index
	    /// <summary>
	    /// Runs the import for the media node given by the "mediaid" query string value into the
	    /// content node given by "contentid", and returns the import log wrapped in a pre element.
	    /// </summary>
	    /// <returns>
	    /// The HTML-encoded log, or an HTTP 400 result when either id is missing or not an integer.
	    /// </returns>
	    public ActionResult Index()
	    {
	        //var mediaNode = 1087;
	        //var contentNode = 4822; // correspondence
	        int mediaNode;
	        int contentNode;
	        if (!int.TryParse(Request.QueryString["mediaid"], out mediaNode) ||
	            !int.TryParse(Request.QueryString["contentid"], out contentNode))
	        {
	            return new HttpStatusCodeResult(400, "mediaid and contentid must be integer node ids");
	        }

	        AddDocumentsForSpecifiedMediaFolder(mediaNode, contentNode);

	        // The log contains file names and exception text, so it must be encoded before it is emitted as HTML.
	        return Content(string.Format("<pre>{0}</pre>", Server.HtmlEncode(output.ToString())));
	    }

	    /// <summary>
	    /// Recursively imports the children of one media node into one content node.
	    /// Folders are processed first (depth-first), then the files of this level in reverse order.
	    /// </summary>
	    /// <param name="mediaNode">Id of the media node whose children are imported.</param>
	    /// <param name="contentNode">Id of the content node that receives the new content.</param>
	    private void AddDocumentsForSpecifiedMediaFolder(int mediaNode, int contentNode)
	    {
	        var mediaRoot = Umbraco.TypedMedia(mediaNode);
	        var contentRoot = Umbraco.TypedContent(contentNode);
	        if (mediaRoot == null || contentRoot == null)
	        {
	            // Previously this dereferenced null and aborted the whole import.
	            output.AppendLine("skipped: media " + mediaNode + " or content " + contentNode + " was not found");
	            return;
	        }

	        // Materialize once: both collections are enumerated several times below.
	        var mediaFiles = mediaRoot.Children.ToList();
	        output.AppendLine("media " + mediaFiles.Count + Environment.NewLine);

	        var existingDocuments = contentRoot.Children.ToList();
	        output.AppendLine("content " + existingDocuments.Count + Environment.NewLine);

	        foreach (var folder in mediaFiles.Where(m => m.DocumentTypeAlias == "Folder"))
	        {
	            Debug.WriteLine(folder.Name);

	            // Look up in the local list: the shared field is overwritten by every recursive call.
	            var existingCategory = existingDocuments.FirstOrDefault(d => d.Name == folder.Name);
	            if (existingCategory != null)
	            {
	                output.AppendLine("Add media in " + folder.Name);
	                AddDocumentsForSpecifiedMediaFolder(folder.Id, existingCategory.Id);
	                continue;
	            }

	            // create new folder:
	            var category = ContentService.CreateContent(folder.Name, contentNode, "category");
	            if (ContentService.SaveAndPublishWithStatus(category))
	            {
	                Debug.WriteLine("Add media in " + folder.Name);
	                output.AppendLine("Add media in " + folder.Name);
	                AddDocumentsForSpecifiedMediaFolder(folder.Id, category.Id);
	            }
	            else
	            {
	                output.AppendLine("could not create category " + folder.Name);
	            }
	        }

	        // The recursion above left the field pointing at a descendant's children;
	        // restore this level's children so duplicate detection checks the right node.
	        documents = existingDocuments;

	        foreach (var file in mediaFiles.Where(m => m.DocumentTypeAlias != "Folder").Reverse())
	        {
	            Debug.WriteLine(file.Name);
	            output.AppendLine(file.Name);
	            output.AppendLine(CreateContentForMediaFile(contentNode, file));
	        }
	    }

	    /// <summary>
	    /// Imports one media file as a "Document" content node under <paramref name="parentNodeId"/>,
	    /// unless a sibling with the same docReferenceNumber already exists.
	    /// </summary>
	    /// <param name="parentNodeId">Id of the content node that receives the document.</param>
	    /// <param name="file">Media item whose Url points at the PDF.</param>
	    /// <returns>A log line describing the outcome; failures are reported here and never thrown.</returns>
	    private string CreateContentForMediaFile(int parentNodeId, IPublishedContent file)
	    {
	        IContent document = null;
	        try
	        {
	            var pdfFile = file.Url;

	            var meta = PDFParser.GetFileMetadata(pdfFile);

	            // check if this document was already added:
	            var referenceNumber = meta.DocReferenceNumber.ToString();
	            if (documents.Any(d => d.GetPropertyValue<string>("docReferenceNumber") == referenceNumber))
	            {
	                Debug.WriteLine("document already exists");
	                return "document already exists: " + file.Name;
	            }

	            document = SetDocumentProperties(parentNodeId, file, meta);

	            var publishAttempt = ContentService.SaveAndPublishWithStatus(document, 0, false);
	            if (!publishAttempt.Success)
	            {
	                // The result used to be ignored, so a failed publish was reported as "saved".
	                LogHelper.Warn<ImportController>("not published: " + meta.Title);
	                return "not published: " + meta.Title + Environment.NewLine;
	            }

	            Debug.WriteLine("saved" + meta.Title);
	            LogHelper.Info<ImportController>("created " + meta.Title);
	            return "saved" + meta.Title + Environment.NewLine;
	        }
	        catch (Exception ex)
	        {
	            Debug.WriteLine(ex);
	            LogHelper.Error<ImportController>("error parsing PDF: ", ex);

	            // Only roll back a document that actually reached the database; deleting a null
	            // or never-saved document just produced a second, silently swallowed exception.
	            if (document != null && document.HasIdentity)
	            {
	                try
	                {
	                    ContentService.Delete(document);
	                }
	                catch (Exception ex2)
	                {
	                    Debug.WriteLine(ex2);
	                    LogHelper.Error<ImportController>("error removing partially imported document: ", ex2);
	                }
	            }

	            return "no file added:" + ex;
	        }
	    }

	    /// <summary>
	    /// Creates an unsaved "Document" content item under <paramref name="parentNodeId"/> and copies
	    /// the PDF metadata onto its properties.
	    /// </summary>
	    /// <param name="parentNodeId">Id of the parent content node.</param>
	    /// <param name="file">Media item stored in the "mainDocument" property (by id).</param>
	    /// <param name="meta">Metadata read from the PDF.</param>
	    /// <returns>The populated, not yet saved, content item.</returns>
	    private static IContent SetDocumentProperties(int parentNodeId, IPublishedContent file, PDFFileMeta meta)
	    {
	        var document = ContentService.CreateContent(meta.Title, parentNodeId, "Document");

	        // Fall back to the PDF creation date when no publication date could be parsed.
	        document.SetValue("Date", meta.DateOfPublication != DateTime.MinValue ? meta.DateOfPublication : meta.Created);

	        var authors = meta.Authors;
	        if (authors != null && authors.Length > 0)
	        {
	            // if correspondence:
	            if (parentNodeId == CorrespondenceNodeId)
	            {
	                document.SetValue("correspondenceFrom", authors[0]);
	                if (authors.Length > 1) document.SetValue("correspondenceTo", authors[1]);
	            }
	            else
	            {
	                document.SetValue("author", authors[0]);
	                if (authors.Length > 1) document.SetValue("author2", authors[1]);
	                if (authors.Length > 2) document.SetValue("author3", authors[2]);
	            }
	        }

	        document.SetValue("originalFile", meta.OriginalFileName);
	        document.SetValue("docReferenceNumber", meta.DocReferenceNumber);

	        document.SetValue("mainDocument", file.Id);

	        document.SetValue("documentHTML", meta.DocumentHTML);

	        document.SetValue("publisher", meta.Publisher);

	        document.SetValue("Comments", meta.Comments);
	        document.SetValue("categoryValue", meta.Category);
	        document.SetValue("typeOfDocument", meta.CategorySelect());

	        document.CreateDate = meta.Created;
	        document.UpdateDate = meta.Modified;
	        return document;
	    }
	}
	}
	using System;
	using System.Diagnostics;
	using System.IO;
	using System.Linq;
	using System.Net;
	using iTextSharp.text.pdf;
	using iTextSharp.text.pdf.parser;
	using Umbraco.Core;
	using Umbraco.Core.IO;

	namespace Archive.FEE.Web.Helper.PDFParser
	{
	/// <summary>
	/// Keys of the PDF document-information entries that the importer reads.
	/// </summary>
	public class PDFProperties
	{
	    public const string Title = "Title";
	    public const string TypeofDocument = "Category";
	    public const string @Date = "Date of Publication";
	    public const string Author = "Author";
	    public const string OriginalFile = "Original File";
	    public const string Publisher = "Publisher";
	    public const string Comments = "Comments";

	    // Was a mutable public static field; made a constant like the other keys so it
	    // cannot be reassigned at runtime.
	    public const string Category = "Category";
	}

	/// <summary>
	/// Metadata extracted from one archived PDF file.
	/// </summary>
	public class PDFFileMeta
	{
	    /// <summary>Author names taken from the PDF "Author" entry (comma separated in the file).</summary>
	    public string[] Authors;

	    /// <summary>Publication date; stays at default(DateTime) when the PDF entry could not be parsed.</summary>
	    public DateTime DateOfPublication;

	    /// <summary>Subject of the document.</summary>
	    public string Subject;

	    /// <summary>Title of the document.</summary>
	    public string Title;

	    public string Publisher { get; set; }

	    /// <summary>Reference number of the document within the archive.</summary>
	    public int DocReferenceNumber { get; set; }

	    public string Comments { get; set; }

	    /// <summary>Text content of the document.</summary>
	    public string DocumentHTML { get; set; }

	    public DateTime Created { get; set; }
	    public string OriginalFileName { get; set; }
	    public DateTime Modified { get; set; }

	    /// <summary>Category name as written in the PDF metadata.</summary>
	    public string Category { get; set; }

	    /// <summary>
	    /// Maps <see cref="Category"/> to its numeric document-type code.
	    /// </summary>
	    /// <returns>
	    /// 0 Correspondence, 1 FEE_Publication, 2 Personal Files, 3 Leonard (E.) Read Journal,
	    /// 4 Non-FEE Publication; -1 for any other value, including null.
	    /// </returns>
	    public int CategorySelect()
	    {
	        // Each case returns directly; the former "break" statements after "return" were unreachable.
	        switch (Category)
	        {
	            case "Correspondence":
	                return 0;
	            case "FEE_Publication":
	                return 1;
	            case "Personal Files":
	                return 2;
	            case "Leonard Read Journal":
	            case "Leonard E. Read Journal":
	                return 3;
	            case "Non-FEE Publication":
	                return 4;
	            default:
	                return -1;
	        }
	    }
	}



	/// <summary>
	/// Reads metadata and page text out of PDF files stored in the media file system.
	/// </summary>
	public static class PDFParser
	{
	    private static readonly MediaFileSystem Media =
	        FileSystemProviderManager.Current.GetFileSystemProvider<MediaFileSystem>();

	    /// <summary>
	    /// Builds a <see cref="PDFFileMeta"/> for the PDF behind a media URL.
	    /// </summary>
	    /// <param name="url">Media URL/path of the PDF; resolved through the media file system.</param>
	    /// <returns>The populated metadata; DocumentHTML holds at most 100000 characters of page text.</returns>
	    /// <exception cref="ArgumentNullException">
	    /// <paramref name="url"/> is null/blank, or the PDF has no Title entry.
	    /// </exception>
	    /// <exception cref="FileNotFoundException">The resolved file does not exist on disk.</exception>
	    public static PDFFileMeta GetFileMetadata(string url)
	    {
	        if (string.IsNullOrWhiteSpace(url))
	        {
	            // The single-argument constructor takes a parameter name, not a message.
	            throw new ArgumentNullException(nameof(url), "file URL is missing");
	        }

	        var filePath = Media.GetFullPath(url);

	        if (!File.Exists(filePath))
	        {
	            throw new FileNotFoundException("File does not exist:" + filePath, filePath);
	        }

	        ReadPDFInfo(filePath);

	        var reader = new MetaDataReader(filePath);

	        // Validate the title before the expensive full-text extraction below.
	        var title = reader.ReadEntry(PDFProperties.Title);
	        if (string.IsNullOrWhiteSpace(title))
	        {
	            throw new ArgumentNullException("Title", "PDF has no Title entry: " + filePath);
	        }

	        var fileInfo = new FileInfo(filePath);

	        // Trim each name so "A, B" does not yield " B"; the element count is unchanged.
	        var authors = reader.ReadEntry(PDFProperties.Author)?.Split(',').Select(a => a.Trim()).ToArray();

	        var meta = new PDFFileMeta
	        {
	            Title = title,
	            Authors = authors,
	            Publisher = reader.ReadEntry(PDFProperties.Publisher),
	            Comments = reader.ReadEntry(PDFProperties.Comments),
	            Category = reader.ReadEntry(PDFProperties.Category),
	            DocumentHTML = GetTextFromAllPages(filePath)?.Truncate(100000),
	            Created = reader.Created(),
	            Modified = reader.Modified(),
	            OriginalFileName = fileInfo.Name
	        };

	        // The reference number is the part of the file name before the first '-'; 0 when it is not numeric.
	        int refNum;
	        int.TryParse(fileInfo.Name.Split('-').FirstOrDefault(), out refNum);
	        meta.DocReferenceNumber = refNum;

	        DateTime dop;
	        if (DateTime.TryParse(reader.ReadEntry(PDFProperties.Date), out dop))
	        {
	            meta.DateOfPublication = dop;
	        }

	        return meta;
	    }

	    /// <summary>
	    /// Writes every document-information entry of the PDF to the debug output.
	    /// </summary>
	    /// <param name="path">Full path of the PDF file.</param>
	    public static void ReadPDFInfo(string path)
	    {
	        var reader = new PdfReader(path);
	        try
	        {
	            foreach (var entry in reader.Info)
	            {
	                Debug.WriteLine(entry.Key + ": " + entry.Value);
	            }
	        }
	        finally
	        {
	            // Release the file; the reader was previously left open.
	            reader.Close();
	        }
	    }

	    /// <summary>
	    /// Extracts the text of every page, one page per line block, in page order.
	    /// </summary>
	    /// <param name="pdfPath">Full path of the PDF file.</param>
	    /// <returns>The concatenated page text.</returns>
	    public static string GetTextFromAllPages(string pdfPath)
	    {
	        var reader = new PdfReader(pdfPath);
	        try
	        {
	            using (var output = new StringWriter())
	            {
	                // PDF page numbers are 1-based.
	                for (int i = 1; i <= reader.NumberOfPages; i++)
	                {
	                    output.WriteLine(PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
	                }

	                return output.ToString();
	            }
	        }
	        finally
	        {
	            // Release the file; the reader was previously left open.
	            reader.Close();
	        }
	    }
	}



	/// <summary>
	/// Gives keyed access to the document-information entries of one PDF file.
	/// The entries are copied when the object is constructed and the file is closed immediately.
	/// </summary>
	internal class MetaDataReader
	{
	    // Accepted shapes of the digit part of a PDF date ("D:" prefix and time-zone suffix removed).
	    private static readonly string[] PdfDateFormats =
	    {
	        "yyyyMMddHHmmss", "yyyyMMddHHmm", "yyyyMMddHH", "yyyyMMdd", "yyyyMM", "yyyy"
	    };

	    private readonly System.Collections.Generic.IDictionary<string, string> _info;

	    /// <summary>Reads the information entries of the PDF at <paramref name="pdfPath"/>.</summary>
	    /// <param name="pdfPath">Full path of the PDF file.</param>
	    public MetaDataReader(string pdfPath)
	    {
	        var reader = new PdfReader(pdfPath);
	        try
	        {
	            _info = reader.Info;
	        }
	        finally
	        {
	            // The reader used to be kept in a field and never closed.
	            reader.Close();
	        }
	    }

	    /// <summary>
	    /// Returns the value stored under <paramref name="key"/>; when that is missing or blank,
	    /// retries with a trailing space appended to the key.
	    /// </summary>
	    /// <param name="key">Name of the information entry.</param>
	    /// <returns>The entry value, or null when neither key is present.</returns>
	    public string ReadEntry(string key)
	    {
	        string value;
	        _info.TryGetValue(key, out value);
	        if (string.IsNullOrWhiteSpace(value)) // bad data
	        {
	            _info.TryGetValue(key + " ", out value);
	        }
	        return value;
	    }

	    /// <summary>Parsed "CreationDate" entry; the current time when missing or unparseable.</summary>
	    public DateTime Created()
	    {
	        string creationDate;
	        _info.TryGetValue("CreationDate", out creationDate);
	        return ParsePDFDate(creationDate);
	    }

	    /// <summary>Parsed "ModDate" entry; the current time when missing or unparseable.</summary>
	    public DateTime Modified()
	    {
	        string modDate;
	        _info.TryGetValue("ModDate", out modDate);
	        return ParsePDFDate(modDate);
	    }

	    /// <summary>
	    /// Parses a PDF date string such as "D:20160822221200Z" or "D:20160822221200-04'00'".
	    /// Only the leading date/time digits are used; any time-zone suffix ("Z", "+HH'mm'",
	    /// "-HH'mm'") is ignored, so the value is the local time as written in the file.
	    /// </summary>
	    /// <param name="date">Raw entry value; may be null.</param>
	    /// <returns>The parsed value, or <see cref="DateTime.Now"/> when the text is blank or malformed.</returns>
	    private static DateTime ParsePDFDate(string date)
	    {
	        if (string.IsNullOrWhiteSpace(date))
	        {
	            return DateTime.Now;
	        }

	        var text = date.Trim();
	        if (text.StartsWith("D:", StringComparison.Ordinal))
	        {
	            text = text.Substring(2);
	        }

	        // Keep the run of leading digits (at most yyyyMMddHHmmss); this drops any offset suffix,
	        // including "+HH'mm'" which previously made the whole value fall back to the current time.
	        var digits = new string(text.TakeWhile(c => c >= '0' && c <= '9').Take(14).ToArray());

	        DateTime parsed;
	        var ok = DateTime.TryParseExact(
	            digits,
	            PdfDateFormats,
	            System.Globalization.CultureInfo.InvariantCulture,
	            System.Globalization.DateTimeStyles.None,
	            out parsed);

	        return ok ? parsed : DateTime.Now;
	    }
	}
	}