Created
March 4, 2021 00:23
-
-
Save seong-min-s/8b375948cc5c00616d5fcf71f52ec09a to your computer and use it in GitHub Desktop.
전처리 코드
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.IO; | |
using System.Data; | |
using HtmlAgilityPack; | |
namespace Preprocessing | |
{ | |
class Program | |
{ | |
/// <summary>
/// Reads the RTF file at <paramref name="path"/> and converts its text to HTML
/// with the SautinSoft RtfToHtml converter.
/// </summary>
/// <param name="path">Path of the RTF file to read.</param>
/// <returns>The string produced by the converter's <c>ConvertString</c> call.</returns>
static string RtfToHtml(string path)
{
    var converter = new SautinSoft.RtfToHtml();
    var rtfContent = File.ReadAllText(path);

    // Judging by the property name, this asks the converter to include images
    // in the generated HTML - confirm against the SautinSoft documentation.
    converter.ImageStyle.IncludeImageInHtml = true;

    return converter.ConvertString(rtfContent);
}
/// <summary>
/// Loads <paramref name="html"/>, copies the first <c>table</c> element into a
/// <see cref="DataTable"/> and writes that table to <paramref name="path"/> as CSV.
/// </summary>
/// <remarks>
/// The first row that contains <c>td</c> cells supplies the column names; every later
/// row with <c>td</c> cells becomes a data row. Rows without <c>td</c> cells are skipped.
/// The number of tables found and every extracted cell are echoed to the console.
/// </remarks>
/// <param name="html">HTML markup to parse.</param>
/// <param name="path">Destination path of the CSV file.</param>
/// <exception cref="InvalidOperationException">
/// The markup contains no <c>table</c> element, or the first table has no direct <c>tr</c> children.
/// </exception>
static void HtmlToCsv(string html, string path)
{
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(html);

    // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when
    // nothing matches, so every result is checked before it is dereferenced.
    HtmlNodeCollection tableNodes = htmlDoc.DocumentNode.SelectNodes("//table");
    Console.WriteLine(tableNodes == null ? 0 : tableNodes.Count);
    if (tableNodes == null)
    {
        throw new InvalidOperationException("No <table> element was found in the HTML.");
    }

    // Only the first table in document order is exported.
    HtmlNode tableNode = tableNodes[0];
    HtmlNodeCollection trNodes = tableNode.SelectNodes("tr");
    if (trNodes == null)
    {
        throw new InvalidOperationException("The first <table> element has no direct <tr> rows.");
    }

    DataTable dataTable = new DataTable();
    foreach (HtmlNode trNode in trNodes)
    {
        HtmlNodeCollection tdNodes = trNode.SelectNodes("td");
        if (tdNodes == null)
        {
            // A row without <td> cells carries nothing to export.
            continue;
        }

        if (dataTable.Columns.Count == 0)
        {
            // Header row: strip line breaks from the names and replace the micro sign.
            for (int j = 0; j < tdNodes.Count; j++)
            {
                string columnName = tdNodes[j].InnerText;
                columnName = columnName.Replace('\n', ' ');
                columnName = columnName.Replace('\r', ' ');
                columnName = columnName.Replace("μ", "m");
                dataTable.Columns.Add(new DataColumn(columnName, typeof(string)));
            }
        }
        else
        {
            // A row wider than the header would otherwise index past the last
            // column; add auto-named columns so that no cell is lost.
            while (dataTable.Columns.Count < tdNodes.Count)
            {
                dataTable.Columns.Add();
            }

            DataRow row = dataTable.NewRow();
            for (int j = 0; j < tdNodes.Count; j++)
            {
                row[j] = tdNodes[j].InnerText;
            }
            dataTable.Rows.Add(row);
        }
    }

    // Echo the extracted cells, one per line, with a separator after each row.
    for (int i = 0; i < dataTable.Rows.Count; i++)
    {
        for (int j = 0; j < dataTable.Columns.Count; j++)
        {
            Console.WriteLine(dataTable.Rows[i][j].ToString());
        }
        Console.WriteLine("=====");
    }

    ExportToCSV(dataTable, path);
}
/// <summary>
/// Writes <paramref name="dtDataTable"/> to <paramref name="strFilePath"/> as CSV:
/// one header line with the column names, then one line per row.
/// </summary>
/// <remarks>
/// Every field is trimmed. A field containing a comma, a double quote or a line break
/// is enclosed in double quotes with embedded quotes doubled. DBNull cells are written
/// as empty fields. An existing file is overwritten, using <c>Encoding.Default</c>.
/// </remarks>
/// <param name="dtDataTable">Table to export.</param>
/// <param name="strFilePath">Path of the CSV file to create or overwrite.</param>
static void ExportToCSV(DataTable dtDataTable, string strFilePath)
{
    // The using block closes the file even when a write throws.
    using (StreamWriter sw = new StreamWriter(strFilePath, false, System.Text.Encoding.Default))
    {
        int columnCount = dtDataTable.Columns.Count;

        // Header line.
        for (int i = 0; i < columnCount; i++)
        {
            sw.Write(EscapeCsvField(dtDataTable.Columns[i].ToString().Trim()));
            if (i < columnCount - 1)
            {
                sw.Write(",");
            }
        }
        sw.Write(sw.NewLine);

        // Data lines.
        foreach (DataRow dr in dtDataTable.Rows)
        {
            for (int i = 0; i < columnCount; i++)
            {
                if (!Convert.IsDBNull(dr[i]))
                {
                    sw.Write(EscapeCsvField(dr[i].ToString().Trim()));
                }
                if (i < columnCount - 1)
                {
                    sw.Write(",");
                }
            }
            sw.Write(sw.NewLine);
        }
    }
}

/// <summary>
/// Returns <paramref name="field"/> unchanged when it needs no quoting; otherwise
/// wraps it in double quotes and doubles every embedded double quote.
/// </summary>
static string EscapeCsvField(string field)
{
    if (field.IndexOfAny(new[] { ',', '"', '\r', '\n' }) < 0)
    {
        return field;
    }
    return "\"" + field.Replace("\"", "\"\"") + "\"";
}
/// <summary>
/// Entry point: converts an RTF file to HTML and exports its first table as CSV.
/// </summary>
/// <param name="args">
/// <c>args[0]</c> is the RTF input path (for example <c>c:\filename.rtf</c>);
/// the optional <c>args[1]</c> is the CSV output path. When it is omitted, the
/// input path with its extension changed to <c>.csv</c> is used.
/// </param>
static void Main(string[] args)
{
    if (args == null || args.Length < 1 || string.IsNullOrWhiteSpace(args[0]))
    {
        Console.Error.WriteLine("Usage: <program> <input.rtf> [output.csv]");
        Environment.ExitCode = 1;
        return;
    }

    string rtfPath = args[0];
    string csvPath = args.Length > 1 ? args[1] : Path.ChangeExtension(rtfPath, ".csv");

    string html = Program.RtfToHtml(rtfPath);
    HtmlToCsv(html, csvPath);
}
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment