Skip to content

Instantly share code, notes, and snippets.

@Sofronio
Created May 5, 2018 04:50
Show Gist options
  • Save Sofronio/1d1bf8d9d5bcc59fb1b88fc3e9c0d150 to your computer and use it in GitHub Desktop.
Save Sofronio/1d1bf8d9d5bcc59fb1b88fc3e9c0d150 to your computer and use it in GitHub Desktop.
YouTube2Subtitle
/// <summary>
/// Loads a YouTube timed-text XML file and refills <c>mySubtitle</c> with one entry per word.
/// </summary>
/// <remarks>
/// Every "p" element is treated as a line and every child node of it as a word.
/// Only lines that have a "t" attribute and contain markup are expanded into words;
/// lines with child nodes but plain text only still advance the line number.
/// A word starts at the line start, plus its own "t" offset when it has one, and ends
/// where the next word starts. The last word of a line ends where the next line starts,
/// or at line start + "d" when there is no next line start to use.
/// </remarks>
/// <param name="filesWithPath">Path of the XML file to load. Null or empty makes the call a no-op.</param>
private void processXML(string filesWithPath)
{
    if (string.IsNullOrEmpty(filesWithPath))
    {
        return;
    }

    mySubtitle.Clear();
    XmlDocument myXMLDoc = new XmlDocument();
    myXMLDoc.Load(filesWithPath);
    XmlNodeList ps = myXMLDoc.GetElementsByTagName("p");

    int psFlag = -1;          // index of the current p inside ps
    int sPosition = 0;        // running character offset of the current word
    int sLineNumber = 0;
    string sLastColor = "";   // colour of the previously emitted word

    foreach (XmlNode p in ps)
    {
        psFlag++;

        // A p without content adds no words; its "t" is only read as the end of the
        // previous line (it could be something like [Music] or [Applause]).
        if (p.ChildNodes.Count == 0)
        {
            continue;
        }

        sLineNumber++;

        string lineStart = xmlAttributeOrNull(p, "t");
        if (lineStart == null || !p.InnerXml.Contains("<"))
        {
            continue;
        }

        string lineEnd = xmlLineEnd(ps, psFlag, lineStart);
        string sLineColor = xmlPenColor(xmlAttributeOrNull(p, "p"));
        bool isFirstWord = true;

        foreach (XmlNode s in p.ChildNodes)
        {
            string sContent = s.InnerText.Trim();

            // Colour: the word's own pen wins; otherwise the first word takes the line
            // colour and later words inherit the colour of the word before them.
            // This rule is applied to every word, whether or not it has its own "t".
            string penId = xmlAttributeOrNull(s, "p");
            string sColor;
            if (penId != null)
            {
                sColor = xmlPenColor(penId);
            }
            else if (isFirstWord)
            {
                sColor = sLineColor;
            }
            else
            {
                sColor = sLastColor;
            }
            sLastColor = sColor;
            isFirstWord = false;

            // Start: a word without "t" starts with the line, otherwise "t" is an
            // offset that is added to the line start.
            string wordOffset = xmlAttributeOrNull(s, "t");
            string sWordStart = wordOffset == null ? lineStart : xmlAddMs(lineStart, wordOffset);

            // End: the start of the following word when it carries an offset,
            // otherwise the end of the line.
            string nextOffset = s.NextSibling == null ? null : xmlAttributeOrNull(s.NextSibling, "t");
            string sWordEnd = nextOffset == null ? lineEnd : xmlAddMs(lineStart, nextOffset);

            Debug.WriteLine(sContent + "\t" + msToTime(sWordStart) + "\t" + msToTime(sWordEnd) + "\t" + sPosition);
            mySubtitle.Add(new subtitle
            {
                Content = sContent,
                WordStart = msToTime(sWordStart),
                WordEnd = msToTime(sWordEnd),
                Position = sPosition,
                Selected = false,
                Color = sColor,
                LineNumber = sLineNumber
            });
            sPosition = sPosition + sContent.Length + 1;
        }
        Debug.WriteLine("");
    }
}

// Returns the value of the named attribute, or null when the node has no attribute
// collection (XmlNode.Attributes is null for non-element nodes) or lacks the attribute.
private static string xmlAttributeOrNull(XmlNode node, string name)
{
    if (node == null || node.Attributes == null)
    {
        return null;
    }
    XmlAttribute attribute = node.Attributes[name];
    return attribute == null ? null : attribute.Value;
}

// Maps a pen id to its colour: <pen id="1" fc="#E5E5E5"/>, <pen id="2" fc="#CCCCCC"/>.
// A missing or unknown pen id falls back to E5E5E5.
private static string xmlPenColor(string penId)
{
    return penId == "2" ? "CCCCCC" : "E5E5E5";
}

// Adds two millisecond values given as text; a null operand counts as 0.
private static string xmlAddMs(string baseMs, string offsetMs)
{
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    int total = Convert.ToInt32(baseMs, invariant) + Convert.ToInt32(offsetMs, invariant);
    return total.ToString(invariant);
}

// End time of the line at the given index: the "t" of the following line when there is
// one, otherwise the line's own start plus its "d" (a missing "d" counts as 0).
private static string xmlLineEnd(XmlNodeList lines, int index, string lineStart)
{
    if (index < lines.Count - 1)
    {
        string nextStart = xmlAttributeOrNull(lines[index + 1], "t");
        if (nextStart != null)
        {
            return nextStart;
        }
    }
    return xmlAddMs(lineStart, xmlAttributeOrNull(lines[index], "d"));
}
/// <summary>
/// Reads the WebVTT lines held in <c>listBox1</c> and appends one entry per word to <c>mySubtitle</c>.
/// </summary>
/// <remarks>
/// First pass: starting at item 10 and stepping 8 items at a time, the timing line and the
/// content line two items below it are appended to <c>strlist_import</c>. The timing line is
/// joined with the timing line of the following cue (or with itself for the last cue).
/// Second pass: every entry of <c>strlist_import</c> is either a joined timing line, which
/// updates the current line times, or a content line, which is split into words.
/// Neither <c>strlist_import</c> nor <c>mySubtitle</c> is cleared by this method.
/// </remarks>
private void processVTT()
{
    const int firstTimingIndex = 10; // item index of the first timing line that is read
    const int cueStride = 8;         // distance in items between two timing lines
    const int contentOffset = 2;     // the content line sits this many items below its timing line

    //when it's a timed line
    //00:00:24.510 --> 00:00:27.170 align:start position:0%
    //TimeStart / TimeEnd come from the current cue, NextStart / NextEnd from the cue after it;
    //" --> " and everything between the two timings is dropped.
    string patternTime = @"(?<TimeStart>\d{2}:\d{2}:\d{2}.\d{3})\s-->\s(?<TimeEnd>\d{2}:\d{2}:\d{2}.\d{3}).*(?<NextStart>\d{2}:\d{2}:\d{2}.\d{3})\s-->\s(?<NextEnd>\d{2}:\d{2}:\d{2}.\d{3})";
    //when it's a content line
    //another<00:21:54.539><c> one</c><00:21:54.809><c> on</c></c><c.colorCCCCCC><00:21:56.700><c> way</c></c>
    //Color     optional <c.colorXXXXXX> in front of the word
    //Content   the word itself, with an optional apostrophe part
    //NextColor optional <c.colorXXXXXX> that follows the word
    //WordEnd   the time tag that follows the word; absent on the last word of a line
    string patternContent = @"(<c.color(?<Color>.*?)>)?(?<Content>(\w+')?(\w+))(<\/c>){0,2}(<c.color(?<NextColor>.*?)>)?<(?<WordEnd>\d{2}:\d{2}:\d{2}.\d{3})?";

    Regex timeRegex = new Regex(patternTime);
    Regex contentRegex = new Regex(patternContent);

    // The guard covers the content line as well, so an incomplete trailing cue is
    // skipped instead of indexing past the end of the list.
    for (int i = firstTimingIndex; i + contentOffset < listBox1.Items.Count; i = i + cueStride)
    {
        string str_time = listBox1.Items[i].ToString();
        string str_content = listBox1.Items[i + contentOffset].ToString();
        string str_nextTime = i + cueStride < listBox1.Items.Count
            ? listBox1.Items[i + cueStride].ToString()
            : str_time;
        strlist_import.Add(str_time + " " + str_nextTime);
        strlist_import.Add(str_content);
    }

    // State that is carried from one entry (and one word) to the next.
    string LineStart = "";
    string LineEnd = "";
    string LineNextStart = "";
    string LineNextEnd = "";
    string sWordStart = "";
    string sColorNext = "";
    string sColorLast = "";
    int sPosition = 0;
    int sLineNumber = 0;

    foreach (var text in strlist_import)
    {
        //match TimeLine
        Match timing = timeRegex.Match(text);
        if (timing.Success)
        {
            LineStart = timing.Groups["TimeStart"].Value;
            LineEnd = timing.Groups["TimeEnd"].Value;
            LineNextStart = timing.Groups["NextStart"].Value;
            LineNextEnd = timing.Groups["NextEnd"].Value;
            // the first word of the coming content line starts with the line
            sWordStart = LineStart;
            continue;
        }

        MatchCollection words = contentRegex.Matches(text);
        if (words.Count == 0)
        {
            continue;
        }

        sLineNumber++;
        foreach (Match m in words)
        {
            string sContent = m.Groups["Content"].Value;

            //youtube default color
            //::cue(c.colorCCCCCC) { color: rgb(204, 204, 204); } <- Secondary
            //::cue(c.colorE5E5E5) { color: rgb(229, 229, 229); } <- Default
            string sColor = m.Groups["Color"].Value;
            if (sColor == "")
            {
                // sColorNext is only empty before the very first word; afterwards the
                // word takes the colour announced behind the previous word.
                sColor = sColorNext == "" ? "FFFFFF" : sColorLast;
            }

            //get next word color
            sColorNext = m.Groups["NextColor"].Value;
            if (sColorNext == "")
            {
                sColorNext = "FFFFFF";
            }

            //the next word's appear time is used as WordEnd; the last word of a line
            //has no time tag behind it and ends with the line.
            string sWordEnd = m.Groups["WordEnd"].Value;
            if (sWordEnd == "")
            {
                sWordEnd = LineEnd;
            }

            mySubtitle.Add(new subtitle
            {
                Content = sContent,
                WordStart = sWordStart,
                WordEnd = sWordEnd,
                NextStart = LineNextStart,
                NextEnd = LineNextEnd,
                Position = sPosition,
                Selected = false,
                Color = sColor,
                ColorNext = sColorNext,
                LineNumber = sLineNumber
            });

            sPosition = sPosition + sContent.Length + 1;
            sWordStart = sWordEnd;
            sColorLast = sColorNext;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment