Created
November 23, 2012 05:58
-
-
Save thomasjungblut/4134189 to your computer and use it in GitHub Desktop.
Bacon Generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package de.jungblut.bacon; | |
import java.io.BufferedReader; | |
import java.io.BufferedWriter; | |
import java.io.FileInputStream; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.util.Arrays; | |
import java.util.HashSet; | |
import java.util.Set; | |
import java.util.regex.Pattern; | |
import com.google.common.collect.HashMultimap; | |
/** | |
* Graph generator for Apache Hama's SSSP. Prints out a tab separated text file | |
* of adjacent actors and a lookup file for ids of the actors. | |
* | |
* @author thomas.jungblut | |
* | |
*/ | |
public final class GraphGen { | |
private static final String IMDB_FILES_DIR = "files/imdb/"; | |
private static final String GRAPH_IN_ACTORS_TXT = "files/imdb/graph-in/actors.txt"; | |
private static final String GRAPH_IN_ADJACENT_ACTORS_TXT = "files/imdb/graph-in/adjacent_actors.txt"; | |
private static final String START_LINE = "----\t\t\t------"; | |
private static final String END_LINE = "-----------------------------------------------------------------------------"; | |
private static final Pattern SPLIT_PATTERN = Pattern.compile("\t\t\t"); | |
private static final Pattern ALTERNATIVE_SPLIT_PATTERN = Pattern | |
.compile("\t\t"); | |
public static void main(String[] args) { | |
// movie -> actors | |
HashMultimap<String, String> collaborationMap = HashMultimap.create(); | |
// actor -> movies | |
HashMultimap<String, String> actorMap = HashMultimap.create(); | |
readActors(collaborationMap, actorMap, "actors.list"); | |
readActors(collaborationMap, actorMap, "actresses.list"); | |
// now we loop over all actors and determine their collaborators | |
Set<String> actorSet = actorMap.keySet(); | |
String[] actorArray = actorSet.toArray(new String[actorSet.size()]); | |
actorSet = null; | |
Arrays.sort(actorArray); | |
System.out.println(actorArray.length); | |
try (BufferedWriter bw = new BufferedWriter(new FileWriter( | |
GRAPH_IN_ADJACENT_ACTORS_TXT))) { | |
for (int i = 0; i < actorArray.length; i++) { | |
Set<String> movies = actorMap.get(actorArray[i]); | |
Set<String> adjacentActors = new HashSet<>(); | |
for (String movie : movies) { | |
// TODO actually you could save the movie over which they are adjacent | |
// (edge value) | |
adjacentActors.addAll(collaborationMap.get(movie)); | |
} | |
adjacentActors.remove(actorArray[i]); | |
StringBuilder sb = new StringBuilder(); | |
sb.append(i); | |
sb.append('\t'); | |
for (String adjacent : adjacentActors) { | |
sb.append(Arrays.binarySearch(actorArray, adjacent)); | |
sb.append('\t'); | |
} | |
sb.append('\n'); | |
bw.write(sb.toString()); | |
if (i % 10000 == 0) { | |
System.out.println(i); | |
} | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
System.out.println("Flush actor lookups"); | |
try (BufferedWriter bw = new BufferedWriter(new FileWriter( | |
GRAPH_IN_ACTORS_TXT))) { | |
for (int i = 0; i < actorArray.length; i++) { | |
bw.write(i + "\t" + actorArray[i] + "\n"); | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
public static void readActors(HashMultimap<String, String> collaborationMap, | |
HashMultimap<String, String> actorMap, String fileName) { | |
try (BufferedReader br = new BufferedReader(new InputStreamReader( | |
new FileInputStream(IMDB_FILES_DIR + fileName), "ISO8859-1"))) { | |
boolean start = false; | |
String line = null; | |
String currentActor = null; | |
while ((line = br.readLine()) != null) { | |
if (!start) { | |
if (line.equals(START_LINE)) { | |
start = true; | |
} | |
} else { | |
if (line.isEmpty()) { | |
currentActor = null; | |
} else if (line.equals(END_LINE)) { | |
break; | |
} else { | |
// really? who has implemented this crazy format? | |
String[] split = SPLIT_PATTERN.split(line); | |
if (split.length != 2) { | |
split = ALTERNATIVE_SPLIT_PATTERN.split(line); | |
if (split.length != 2) { | |
split = line.split("\t"); | |
} | |
} | |
if (currentActor == null) { | |
currentActor = split[0]; | |
} | |
if (split.length < 2) { | |
System.out.println("Couldn't parse line correctly: " + line); | |
continue; | |
} | |
String normalizedMovieName = split[1].substring(0, | |
split[1].indexOf(")") + 1); | |
collaborationMap.put(normalizedMovieName, currentActor); | |
actorMap.put(currentActor, normalizedMovieName); | |
} | |
} | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment