Created
May 5, 2018 04:50
-
-
Save Sofronio/1d1bf8d9d5bcc59fb1b88fc3e9c0d150 to your computer and use it in GitHub Desktop.
YouTube2Subtitle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Loads the XML caption file at <paramref name="filesWithPath"/> and rebuilds
/// <c>mySubtitle</c> with one entry per child node (word) of every timed
/// <c>p</c> element.
/// </summary>
/// <remarks>
/// Timing model used by this method:
/// - each <c>p</c> is a timed line; its <c>t</c> attribute is the line start;
/// - a child without <c>t</c> starts at the line start, a child with <c>t</c>
///   starts at line start + <c>t</c>;
/// - a word ends where the next word starts; the last word of a line ends at
///   the next <c>p</c>'s start, or at line start + <c>d</c> for the final <c>p</c>.
/// A <c>p</c> with children but no markup, or without a <c>t</c> attribute, still
/// increments the line number but produces no entries.
/// </remarks>
/// <param name="filesWithPath">
/// Path of the XML file to load. Nothing happens when it is null or empty.
/// </param>
private void processXML(string filesWithPath)
{
    if (string.IsNullOrEmpty(filesWithPath))
    {
        return;
    }

    mySubtitle.Clear();
    XmlDocument myXMLDoc = new XmlDocument();
    myXMLDoc.Load(filesWithPath);
    XmlNodeList ps = myXMLDoc.GetElementsByTagName("p");

    int sPosition = 0;      // running character offset of the word (word length + 1 per word)
    int sLineNumber = 0;    // 1-based counter of p elements that have children
    string sLastColor = ""; // colour of the most recently emitted word

    for (int psIndex = 0; psIndex < ps.Count; psIndex++)
    {
        XmlNode p = ps[psIndex];
        if (p.ChildNodes.Count == 0)
        {
            // p without content only provides the end time of the previous line
            // (it is read through xmlLineEnd when the previous line is processed).
            continue;
        }

        sLineNumber++;
        if (!p.InnerXml.Contains("<") || p.Attributes["t"] == null)
        {
            continue;
        }

        string lineStart = p.Attributes["t"].Value;
        string sLineColor = xmlPenToColor(xmlAttributeOrNull(p, "p"));
        bool isFirstWord = true;

        foreach (XmlNode s in p.ChildNodes)
        {
            string sContent = s.InnerText.Trim();
            // Non-element children (e.g. text nodes) have no attribute collection;
            // they are treated like a word without "t" and without "p".
            string wordOffset = xmlAttributeOrNull(s, "t");
            string wordPen = xmlAttributeOrNull(s, "p");

            // Colour: an explicit pen wins; otherwise the first word takes the
            // line colour and later words inherit the previous word's colour.
            // This is resolved for every word, so a pen on a word that also
            // carries "t" is honoured.
            string sColor;
            if (wordPen != null)
            {
                sColor = xmlPenToColor(wordPen);
            }
            else if (isFirstWord)
            {
                sColor = sLineColor;
            }
            else
            {
                sColor = sLastColor;
            }
            sLastColor = sColor;

            // WordStart: the line start, shifted by the word's own offset when present.
            string sWordStart;
            if (wordOffset == null)
            {
                sWordStart = lineStart;
            }
            else
            {
                sWordStart = (Convert.ToInt32(lineStart) + Convert.ToInt32(wordOffset)).ToString();
            }

            // WordEnd: the start of the next word, or the end of the line when
            // this is the last word or the next node carries no offset.
            string sWordEnd;
            string nextOffset = s.NextSibling == null ? null : xmlAttributeOrNull(s.NextSibling, "t");
            if (nextOffset == null)
            {
                sWordEnd = xmlLineEnd(ps, psIndex);
            }
            else
            {
                sWordEnd = (Convert.ToInt32(lineStart) + Convert.ToInt32(nextOffset)).ToString();
            }

            Debug.WriteLine(sContent + "\t" + msToTime(sWordStart) + "\t" + msToTime(sWordEnd) + "\t" + sPosition);
            mySubtitle.Add(new subtitle
            {
                Content = sContent,
                WordStart = msToTime(sWordStart),
                WordEnd = msToTime(sWordEnd),
                Position = sPosition,
                Selected = false,
                Color = sColor,
                LineNumber = sLineNumber
            });
            sPosition = sPosition + sContent.Length + 1;
            isFirstWord = false;
        }
        Debug.WriteLine("");
    }
}

/// <summary>
/// Returns the value of attribute <paramref name="name"/> on <paramref name="node"/>,
/// or null when the node has no attribute collection or no such attribute.
/// </summary>
private static string xmlAttributeOrNull(XmlNode node, string name)
{
    if (node.Attributes == null)
    {
        return null;
    }
    XmlAttribute attribute = node.Attributes[name];
    return attribute == null ? null : attribute.Value;
}

/// <summary>
/// Maps a pen id to its colour: "2" is CCCCCC; "1", any other id and a missing
/// pen (null) are E5E5E5.
/// </summary>
private static string xmlPenToColor(string pen)
{
    //<pen id="1" fc="#E5E5E5"/>
    //<pen id="2" fc="#CCCCCC"/>
    //if not said, it'll be id="1" fc="#E5E5E5"
    return pen == "2" ? "CCCCCC" : "E5E5E5";
}

/// <summary>
/// Returns the end time of the line <c>ps[psIndex]</c> as a string: the next
/// <c>p</c>'s start when there is one, otherwise line start + duration (<c>d</c>).
/// Falls back to the line start itself when no duration is available.
/// </summary>
private static string xmlLineEnd(XmlNodeList ps, int psIndex)
{
    if (psIndex < ps.Count - 1)
    {
        string nextStart = xmlAttributeOrNull(ps[psIndex + 1], "t");
        if (nextStart != null)
        {
            return nextStart;
        }
    }

    // Final line (or the next line has no start): LineStart + Duration,
    // which is a bit longer than normal ones.
    string lineStart = xmlAttributeOrNull(ps[psIndex], "t");
    string duration = xmlAttributeOrNull(ps[psIndex], "d");
    if (duration == null)
    {
        return lineStart;
    }
    return (Convert.ToInt32(lineStart) + Convert.ToInt32(duration)).ToString();
}
/// <summary>
/// Reads the VTT lines held in <c>listBox1</c>, appends paired time/content
/// strings to <c>strlist_import</c>, then parses every string in
/// <c>strlist_import</c> and appends one <c>subtitle</c> entry per word to
/// <c>mySubtitle</c>.
/// </summary>
/// <remarks>
/// NOTE(review): neither <c>strlist_import</c> nor <c>mySubtitle</c> is cleared
/// here; presumably the caller resets them before a new import. Confirm.
/// </remarks>
private void processVTT()
{
    //when it's a timed line
    //00:00:24.510 --> 00:00:27.170 align:start position:0%
    //(\d{2}:\d{2}:\d{2}.\d{3})     $1      00:00:24.510
    //\s-->\s                       wasted  " --> "
    //(\d{2}:\d{2}:\d{2}.\d{3})     $2      00:00:27.170
    //" position:0%"                wasted
    //The pattern matches the current timed line followed by the next timed line
    //(the two are joined with a space when strlist_import is filled below).
    //NOTE(review): the "." before the milliseconds is unescaped and matches any character.
    string patternTime = @"(?<TimeStart>\d{2}:\d{2}:\d{2}.\d{3})\s-->\s(?<TimeEnd>\d{2}:\d{2}:\d{2}.\d{3}).*(?<NextStart>\d{2}:\d{2}:\d{2}.\d{3})\s-->\s(?<NextEnd>\d{2}:\d{2}:\d{2}.\d{3})";
    //when it's a content line
    //another<00:21:54.539><c> one</c><00:21:54.809><c> on</c><00:21:54.960><c> top</c><00:21:55.639><c> by</c><00:21:56.639><c> the</c></c><c.colorCCCCCC><00:21:56.700><c> way</c></c><c.colorE5E5E5><00:21:56.940><c> double</c><00:21:57.840><c> it</c></c>
    //<c.colorE5E5E5>close<00:20:24.640><c> it</c></c><c.colorCCCCCC><00:20:24.910><c> close</c><00:20:25.060><c> it</c></c>
    //(?<Content>(\w+')?(\w+))                  get Content
    //(<\/c>){0,2}                              drop up to two closing </c>
    //(<c.color(?<NextColor>.*?)>)?             colour tag that opens before the next word
    //<(?<WordEnd>\d{2}:\d{2}:\d{2}.\d{3})?     get WordEnd, zero or one time "00:21:54.539"; the last word of a line has none
    string patternContent = @"(<c.color(?<Color>.*?)>)?(?<Content>(\w+')?(\w+))(<\/c>){0,2}(<c.color(?<NextColor>.*?)>)?<(?<WordEnd>\d{2}:\d{2}:\d{2}.\d{3})?";

    string LineEnd = "";
    string LineNextStart = "";
    string LineNextEnd = "";
    string sWordStart = "";
    string sColorNext = "";
    string sColorLast = "";
    int sPosition = 0;
    int sLineNumber = 0;

    // Timed lines are read at index 10, 18, 26, ... and their content two lines
    // below. NOTE(review): presumably this matches the layout of the VTT file
    // loaded line by line into listBox1. Confirm against the loader.
    // The guard covers i + 2 so that a truncated tail cannot index past the end.
    for (int i = 10; i + 2 < listBox1.Items.Count; i = i + 8)
    {
        string str_time = listBox1.Items[i].ToString();
        string str_content = listBox1.Items[i + 2].ToString();
        // The last timed line has no successor, so it is paired with itself.
        string str_nextTime = (i + 8 < listBox1.Items.Count)
            ? listBox1.Items[i + 8].ToString()
            : str_time;
        strlist_import.Add(str_time + " " + str_nextTime);
        strlist_import.Add(str_content);
    }

    foreach (var text in strlist_import)
    {
        //match TimeLine
        Match timeMatch = Regex.Match(text, patternTime);
        if (timeMatch.Success)
        {
            LineEnd = timeMatch.Groups["TimeEnd"].Value;
            LineNextStart = timeMatch.Groups["NextStart"].Value;
            LineNextEnd = timeMatch.Groups["NextEnd"].Value;
            // The first word of the following content line starts with the line.
            sWordStart = timeMatch.Groups["TimeStart"].Value;
            continue;
        }

        MatchCollection words = Regex.Matches(text, patternContent);
        if (words.Count == 0)
        {
            continue;
        }

        sLineNumber++;
        foreach (Match m in words)
        {
            string sContent = m.Groups["Content"].Value;

            //youtube default color
            //::cue(c.colorCCCCCC) { color: rgb(204, 204, 204); } <- Secondary
            //::cue(c.colorE5E5E5) { color: rgb(229, 229, 229); } <- Default
            // A word without its own colour tag gets FFFFFF only while no word
            // has been processed yet (sColorNext still empty); afterwards it
            // takes the NextColor recorded for the previous word.
            string sColor;
            if (m.Groups["Color"].Value != "")
            {
                sColor = m.Groups["Color"].Value;
            }
            else if (sColorNext == "")
            {
                sColor = "FFFFFF";
            }
            else
            {
                sColor = sColorLast;
            }

            //get next word color
            sColorNext = m.Groups["NextColor"].Value == ""
                ? "FFFFFF"
                : m.Groups["NextColor"].Value;

            //get next word appear time, called WordEnd here; youtube vtt has no
            //intervals between words, so a word without a time ends with the line.
            string sWordEnd = m.Groups["WordEnd"].Value == ""
                ? LineEnd
                : m.Groups["WordEnd"].Value;

            mySubtitle.Add(new subtitle
            {
                Content = sContent,
                WordStart = sWordStart,
                WordEnd = sWordEnd,
                NextStart = LineNextStart,
                NextEnd = LineNextEnd,
                Position = sPosition,
                Selected = false,
                Color = sColor,
                ColorNext = sColorNext,
                LineNumber = sLineNumber
            });

            sPosition = sPosition + sContent.Length + 1;
            sWordStart = sWordEnd;
            sColorLast = sColorNext;
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment