Created
August 8, 2012 21:39
-
-
Save emeeks/3299027 to your computer and use it in GitHub Desktop.
Create Node and Edge Tables from MALLET Topic-Modeling Output
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
header("Content-type: text/html; charset=utf-8"); | |
$link = mysql_connect("localhost","gephi","gephi"); | |
mysql_select_db("dh_stanford"); | |
mysql_set_charset('utf8'); | |
echo ("Start-"); | |
echo date(DATE_RFC822); | |
echo ("<br>"); | |
ini_set('memory_limit', '1024M'); | |
//Thresholds for word incidence and topic connection levels | |
$word_threshold = 1; | |
$topic_threshold = .01; | |
//Initialize the two files we're working with | |
$topicDocFile = "topic_to_doc.txt"; | |
$wordTopicFile = "word_to_topic.txt"; | |
//unlink($myFile); | |
$fh = fopen($topicDocFile, 'r') or die("can't open file"); | |
echo $topicDocFile; | |
echo ("<br>"); | |
//It might run a while | |
set_time_limit(0); | |
//Set up the tables | |
/* | |
$query = " | |
DROP TABLE IF EXISTS `raw`; | |
CREATE TABLE `raw` ( | |
`id` int(11) NOT NULL AUTO_INCREMENT, | |
`source` int(11) DEFAULT NULL, | |
`target` int(11) DEFAULT NULL, | |
`weight` decimal(5,4) DEFAULT NULL, | |
`incidence` int(11) DEFAULT NULL, | |
PRIMARY KEY (`id`) | |
) ENGINE=MyISAM AUTO_INCREMENT=49801 DEFAULT CHARSET=utf8 ROW_FORMAT=FIXED; | |
DROP TABLE IF EXISTS `raw_nodes`; | |
CREATE TABLE `raw_nodes` ( | |
`ID` int(11) NOT NULL DEFAULT '0', | |
`LABEL` varchar(255) DEFAULT NULL, | |
`TYPE` varchar(255) DEFAULT NULL, | |
PRIMARY KEY (`ID`) | |
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC; | |
"; | |
$write = mysql_query($query); | |
*/ | |
//READ HEADER and throw it away | |
//This is the "#doc source topic proportion line and it's worthless | |
$buffer = fgets($fh); | |
//COMPUTE LENGTH | |
$splitcontents = explode($topicDocFile, $buffer); | |
$book_id = 0; | |
$book_label = "blank"; | |
$delimiter = "\t"; | |
while (!feof($fh)) { | |
$buffer = fgets($fh); | |
$counter = 0; | |
$splitcontents = explode($delimiter, $buffer); | |
$x = 0; | |
while ($x < count($splitcontents)) { | |
//GET ID | |
// The first column is your book's numerical ID | |
if ($x == 0) { | |
$book_id = $splitcontents[$x]; | |
} | |
// The second column is the book's label | |
if ($x == 1) { | |
// If your book label is legible, uncomment this | |
//$book_label = $splitcontents[$x]; | |
//else just concatenate "book" and the id value | |
$book_label = "Book " . $book_id; | |
$query = ' | |
INSERT | |
INTO | |
raw_nodes | |
(`ID`, `LABEL`, `TYPE`) | |
VALUES | |
('.$book_id.', "'.$book_label.'", "BOOK") | |
'; | |
$write = mysql_query($query); | |
} | |
else if ($x > 1) { | |
$adjusted_topic = $splitcontents[$x] + 4000000; | |
if ($splitcontents[$x+1] >= $topic_threshold) { | |
$query = ' | |
INSERT | |
INTO | |
raw | |
(`id`, `source`, `target`, `weight`, `incidence`) | |
VALUES | |
(NULL, '.$book_id.', '.$adjusted_topic.', '.$splitcontents[$x+1].', 0) | |
'; | |
$write = mysql_query($query); | |
} | |
//Double increment because we're looking at topic_weight pairs | |
$x++; | |
} | |
$x++; | |
} | |
} | |
echo("Finished-"); | |
echo date(DATE_RFC822); | |
fclose($fh); | |
//Create Topic Nodes | |
$query = ' | |
INSERT INTO | |
raw_nodes | |
SELECT DISTINCT | |
raw.target, | |
CONCAT("Topic ", (raw.target - 4000000)), | |
"TOPIC" | |
FROM | |
raw | |
'; | |
$write = mysql_query($query); | |
//Word to Topic links | |
$fh = fopen($wordTopicFile, 'r') or die("can't open file"); | |
$word_id = 0; | |
$word_label = "blank"; | |
$delimiter = " "; | |
while (!feof($fh)) { | |
$buffer = fgets($fh); | |
$counter = 0; | |
$splitcontents = explode($delimiter, $buffer); | |
$x = 0; | |
while ($x < count($splitcontents)) { | |
//GET ID | |
/* The first column is MALLET's numerical ID for its words, we're going to modify this so as not to | |
interfere with our unique IDs for topics or books | |
*/ | |
if ($x == 0) { | |
$word_id = $splitcontents[$x] + 2000000; | |
} | |
// The second column is the word | |
if ($x == 1) { | |
$word_label = $splitcontents[$x]; | |
$query = ' | |
INSERT | |
INTO | |
raw_nodes | |
(`ID`, `LABEL`, `TYPE`) | |
VALUES | |
('.$word_id.', "'.$word_label.'", "WORD") | |
'; | |
$write = mysql_query($query); | |
} | |
else if ($x > 1) { | |
//This time MALLET presents us with ratio pairs, so we need to split those. | |
$splitpair = explode(":", $splitcontents[$x]); | |
$adjusted_topic = $splitpair[0] + 4000000; | |
if( $splitpair[1] >= $word_threshold) { | |
$query = ' | |
INSERT | |
INTO | |
raw | |
(`id`, `source`, `target`, `weight`, `incidence`) | |
VALUES | |
(NULL, '.$word_id.', '.$adjusted_topic.', 1, '.$splitpair[1].') | |
'; | |
$write = mysql_query($query); | |
//Double increment because we're looking at topic_weight pairs | |
} | |
// $x++; | |
} | |
$x++; | |
} | |
} | |
mysql_close($link); | |
fclose($fh); | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment