Last active
March 14, 2017 12:02
-
-
Save jakkaj/f69b64d5217be3192c84f398741190a9 to your computer and use it in GitHub Desktop.
Parse WebVTT to POCO ready for indexing using Azure Search.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Parses WebVTT caption text into a list of <see cref="IndexText"/> documents,
/// one per cue that has text.
/// </summary>
/// <remarks>
/// The input is processed line by line. A line containing "--&gt;" is a cue timing
/// line; the non-blank lines that follow it, up to the next blank line or timing
/// line, are that cue's text and are joined with a single space into one document.
/// Lines outside a cue (the "WEBVTT" header, cue identifiers, NOTE/STYLE/REGION
/// blocks) are skipped so they are never indexed under a previous cue's timings.
/// </remarks>
/// <param name="vtt">Full contents of the WebVTT file.</param>
/// <param name="videoId">Identifier copied to every document and used as the key prefix.</param>
/// <returns>The parsed documents in file order; empty when no cue has text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="vtt"/> is null.</exception>
/// <exception cref="FormatException">A timing line holds a timestamp that cannot be parsed.</exception>
static List<IndexText> Parse(string vtt, string videoId)
{
    if (vtt == null)
    {
        throw new ArgumentNullException(nameof(vtt));
    }

    // Timings are machine-readable data, so never format them with the current culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // "\r\n" is listed first so a Windows line ending is not split into two lines.
    string[] lines = vtt.Split(new string[] { "\r\n", "\n", "\r" }, StringSplitOptions.None);

    var tcStart = default(TimeSpan);
    var tcEnd = default(TimeSpan);
    var indexList = new List<IndexText>();

    // True between a timing line and the blank line that ends the cue.
    var inCue = false;
    // Document being built for the current cue; null until its first text line.
    IndexText current = null;

    foreach (var l in lines)
    {
        if (string.IsNullOrWhiteSpace(l))
        {
            // A blank line terminates the current block (cue, header or note).
            inCue = false;
            current = null;
            continue;
        }

        if (l.IndexOf("-->", StringComparison.Ordinal) != -1)
        {
            // This is a timecode line: "start --> end [cue settings]".
            var tc = l.Split(new string[] { "-->" }, StringSplitOptions.None);
            tcStart = ParseVttTimestamp(tc[0]);
            tcEnd = ParseVttTimestamp(tc[1]);
            inCue = true;
            current = null;
            continue;
        }

        if (!inCue)
        {
            // Header text, cue identifier or NOTE/STYLE/REGION content: not caption text.
            continue;
        }

        if (current == null)
        {
            var s = tcStart.TotalMilliseconds.ToString(invariant);
            var e = tcEnd.TotalMilliseconds.ToString(invariant);

            current = new IndexText
            {
                Text = l,
                Start = s,
                End = e,
                IndexId = videoId + s + e,
                VideoId = videoId
            };
            indexList.Add(current);
        }
        else
        {
            // Further lines of the same cue share its key, so they are merged into
            // the existing document instead of producing a duplicate IndexId.
            current.Text += " " + l;
        }
    }

    return indexList;
}

/// <summary>
/// Parses one side of a WebVTT timing line into a <see cref="TimeSpan"/>.
/// Accepts "hh:mm:ss.ttt" and the short "mm:ss.ttt" form, and ignores any
/// cue settings that follow the timestamp.
/// </summary>
static TimeSpan ParseVttTimestamp(string value)
{
    var token = value.Trim();

    // Anything after the first whitespace is cue settings (e.g. "align:start"), not time.
    var settingsAt = token.IndexOfAny(new char[] { ' ', '\t' });
    if (settingsAt >= 0)
    {
        token = token.Substring(0, settingsAt);
    }

    // WebVTT allows the hours to be omitted; TimeSpan.Parse does not accept "mm:ss.ttt",
    // so add the hours back. Only done when the fraction follows the single colon.
    var firstColon = token.IndexOf(':');
    if (firstColon >= 0
        && firstColon == token.LastIndexOf(':')
        && token.IndexOf('.') > firstColon)
    {
        token = "00:" + token;
    }

    return TimeSpan.Parse(token, System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// Search index document holding a piece of caption text for a video, together
/// with the timing values and identifiers stored alongside it.
/// </summary>
public class IndexText
{
    /// <summary>Document key (marked as the key field and filterable).</summary>
    [Key, IsFilterable]
    public string IndexId { get; set; }

    /// <summary>Start of the caption, stored as a string.</summary>
    public string Start { get; set; }

    /// <summary>End of the caption, stored as a string.</summary>
    public string End { get; set; }

    /// <summary>Caption text; searchable, using the EnLucene analyzer.</summary>
    [IsSearchable, Analyzer(AnalyzerName.AsString.EnLucene)]
    public string Text { get; set; }

    /// <summary>Identifier of the owning video; sortable, filterable and searchable.</summary>
    [IsSortable, IsFilterable, IsSearchable]
    public string VideoId { get; set; }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment