Created
May 8, 2020 17:49
-
-
Save lars-erik/ddc6a5c03878b293650d54212b3c8c4b to your computer and use it in GitHub Desktop.
Old-fashioned free PDF indexing with Umbraco
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Threading.Tasks; | |
using System.Xml.Linq; | |
using Examine; | |
using Examine.LuceneEngine.Providers; | |
using Lucene.Net.Analysis; | |
using Microsoft.WindowsAzure.Storage.Auth; | |
using Microsoft.WindowsAzure.Storage.Blob; | |
using MyCustomer.Core.Infrastructure; | |
using MyCustomer.Core.Magazine; | |
using MyCustomer.Core.Queries; | |
using Directory = Lucene.Net.Store.Directory; | |
namespace MySite.Backoffice.Magazine
{
    /// <summary>
    /// Lucene indexer that indexes the text content of magazine PDFs.
    /// The list of magazines comes from an <see cref="IQueryHandler{MagazineQuery}"/>;
    /// each PDF is downloaded from blob storage, run through <see cref="PDFParser"/>,
    /// and stored in the index together with its edition and URL.
    /// </summary>
    public class MagazineIndexer : LuceneIndexer
    {
        /// <summary>
        /// Root address of the blob storage account. Used both for the client endpoint
        /// and as the prefix for the relative PDF paths stored on each node.
        /// </summary>
        private const string BlobStorageBaseUrl = "https://mycustomer.blob.core.windows.net";

        // Assigned by the full constructor only; not read anywhere in this class.
        private readonly Directory directory;

        private readonly IQueryHandler<MagazineQuery> queryHandler;

        // NOTE(review): the account name and key are hard-coded literals. Real
        // credentials should be loaded from configuration / a secret store rather
        // than committed to source control.
        private readonly CloudBlobClient cloudBlobClient = new CloudBlobClient(
            new Uri(BlobStorageBaseUrl),
            new StorageCredentials(
                "storagename",
                "secret"
            )
        );

        private readonly PDFParser pdfParser = new PDFParser();

        /// <summary>
        /// Creates an indexer whose query handler is resolved from the global
        /// <c>Container</c>. The <c>directory</c> field is left unassigned.
        /// </summary>
        /// <remarks>
        /// NOTE(review): presumably this is the constructor used when the provider is
        /// instantiated from configuration — confirm against the Examine setup.
        /// </remarks>
        public MagazineIndexer()
        {
            queryHandler = Container.Instance.Resolve<IQueryHandler<MagazineQuery>>();
        }

        /// <summary>
        /// Creates an indexer with explicitly supplied dependencies.
        /// </summary>
        /// <param name="directory">Lucene directory passed on to the base indexer.</param>
        /// <param name="queryHandler">Handler that returns the magazines to index.</param>
        /// <param name="analyzer">Lucene analyzer passed on to the base indexer.</param>
        /// <param name="async">Async flag passed on to the base indexer.</param>
        public MagazineIndexer(Directory directory, IQueryHandler<MagazineQuery> queryHandler, Analyzer analyzer, bool async)
            : base(
                // No field or node-type restrictions: every value returned by
                // GetDataToIndex is handed to the base indexer as-is.
                new IndexCriteria(
                    Enumerable.Empty<IIndexField>(),
                    Enumerable.Empty<IIndexField>(),
                    Enumerable.Empty<string>(),
                    Enumerable.Empty<string>(),
                    null
                ),
                directory,
                analyzer,
                async
            )
        {
            this.directory = directory;
            this.queryHandler = queryHandler;
        }

        /// <summary>
        /// Queries all magazines and adds one node per magazine to the index.
        /// Each node is a <c>&lt;url&gt;</c> element whose value is the magazine's PDF
        /// path and whose <c>id</c> attribute is <c>year * 100 + month</c>.
        /// </summary>
        /// <param name="type">Not used by this implementation.</param>
        protected override void PerformIndexAll(string type)
        {
            // The override is synchronous, so the asynchronous query has to be
            // blocked on. NOTE(review): Task.Run presumably moves the await off the
            // calling thread's synchronization context to avoid a deadlock — confirm.
            var magazines = (IEnumerable<Magazine>)Task.Run(async () => await queryHandler.Execute(new MagazineQuery())).Result;

            var nodes = magazines.Select(magazine =>
            {
                var id = Convert.ToInt32(magazine.Year) * 100 + Convert.ToInt32(magazine.Month);
                var element = new XElement("url", magazine.PDF);
                element.Add(new XAttribute("id", id));
                return element;
            });

            AddNodesToIndex(nodes, "");
        }

        /// <summary>
        /// Rebuilds the index by re-indexing every magazine.
        /// </summary>
        protected override void PerformIndexRebuild()
        {
            IndexAll("");
        }

        /// <summary>
        /// Downloads the PDF referenced by <paramref name="node"/> and extracts the
        /// fields to store in the index.
        /// </summary>
        /// <param name="node">
        /// A <c>&lt;url&gt;</c> element as produced by <see cref="PerformIndexAll"/>:
        /// its value is the PDF path relative to the blob storage root, and its
        /// <c>id</c> attribute is read as four year characters followed by two month
        /// characters.
        /// </param>
        /// <param name="type">Not used by this implementation.</param>
        /// <returns>
        /// A dictionary with the keys <c>type</c>, <c>edition</c>, <c>url</c> and
        /// <c>body</c>, or an empty dictionary when anything goes wrong (the failure
        /// is written to the trace log).
        /// </returns>
        protected override Dictionary<string, string> GetDataToIndex(XElement node, string type)
        {
            try
            {
                // Derive the edition before touching the network so a malformed node
                // fails fast instead of after a full download and parse.
                var id = node.Attribute("id").Value;
                var edition = id.Substring(4, 2) + " / " + id.Substring(0, 4);

                var pdf = cloudBlobClient.GetBlobReferenceFromServer(new Uri(BlobStorageBaseUrl + node.Value));
                using (var stream = new MemoryStream())
                {
                    pdf.DownloadToStream(stream);
                    // Rewind so the parser reads from the start of the downloaded data.
                    stream.Seek(0, SeekOrigin.Begin);
                    var text = pdfParser.GetTextFromAllPages(stream, (e) => { });

                    return new Dictionary<string, string>
                    {
                        {"type", "magasin" },
                        {"edition", edition },
                        {"url", node.Value },
                        {"body", text}
                    };
                }
            }
            catch (Exception ex)
            {
                // Best effort: one broken PDF must not abort the whole indexing run,
                // but the failure should leave a trace instead of vanishing silently.
                System.Diagnostics.Trace.TraceError(
                    "MagazineIndexer: failed to index PDF '{0}': {1}",
                    node != null ? node.Value : null,
                    ex);
                return new Dictionary<string, string>();
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment