Last active
September 1, 2015 05:11
-
-
Save inkhorn/9044779 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the text-mining package that provides Corpus(), VectorSource() and
# DocumentTermMatrix().
library(tm)

# Read the raw file one line at a time; every line becomes one document in
# the corpus below (presumably one recipe per line -- confirm against the data).
recipes <- readLines("recipes combined.tsv")
# Once the lines are in R, replace the \t separators with spaces so that the
# text is more acceptable to the tm package. gsub() is already vectorised, so
# no apply() over a one-column matrix is needed; fixed = TRUE matches the
# literal tab character.
recipes.new <- gsub("\t", " ", recipes, fixed = TRUE)
# Build a corpus with one document per element and tabulate term counts.
recipes.corpus <- Corpus(VectorSource(recipes.new))
recipes.dtm <- DocumentTermMatrix(recipes.corpus)
# Keep only the terms that occur at least 10 times in the corpus. Note that
# findFreqTerms() thresholds on a term's total count, which equals its
# document count only if no term repeats within a document.
# The result is used directly as the dictionary: a plain character vector is
# accepted by the `dictionary` control option, and the Dictionary()
# constructor is not available in newer tm releases.
recipes.dict <- findFreqTerms(recipes.dtm, 10)
recipes.dtm.filtered <- DocumentTermMatrix(
  recipes.corpus,
  control = list(dictionary = recipes.dict)
)
# Count the retained ingredients in each document so that documents left
# with 0 ingredients can be deleted.
ingredient.counts <- rowSums(as.matrix(recipes.dtm.filtered))
# The trailing comma matters: it selects rows (documents) and keeps the
# result a document-term matrix.
recipes.dtm.filtered <- recipes.dtm.filtered[ingredient.counts > 0, ]
# Compute simple ingredient frequencies so that they can be plotted and used
# to decide which ingredients to filter out.
recipes.m <- as.matrix(recipes.dtm.filtered)
# Column sums give the total count of each term, sorted most frequent first.
popularity.of.ingredients <- sort(colSums(recipes.m), decreasing = TRUE)
popularity.of.ingredients <- data.frame(
  ingredients = names(popularity.of.ingredients),
  num_recipes = popularity.of.ingredients
)
# Order the factor levels by frequency so the plot axis follows popularity
# rather than alphabetical order.
popularity.of.ingredients$ingredients <- reorder(
  popularity.of.ingredients$ingredients,
  popularity.of.ingredients$num_recipes
)

library(ggplot2)
# head() is used instead of [1:30, ] so that a vocabulary with fewer than 30
# terms does not introduce NA rows into the plot data.
ggplot(
  head(popularity.of.ingredients, 30),
  aes(x = ingredients, y = num_recipes)
) +
  geom_point(size = 5, colour = "red") +
  coord_flip() +
  ggtitle("Recipe Popularity of Top 30 Ingredients") +
  theme(
    axis.text.x = element_text(size = 13, face = "bold", colour = "black"),
    axis.text.y = element_text(size = 13, colour = "black", face = "bold"),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 24, face = "bold")
  )
# Load the topic-modelling package that provides LDA() and terms().
library(topicmodels)

# Having found wheat, egg, and butter to be the three most frequent
# ingredients (and not caring too much about them as ingredients in general),
# remove them from the corpus and redo the document term matrix.
removed.ingredients <- c("wheat", "egg", "butter")
recipes.corpus <- tm_map(recipes.corpus, removeWords, removed.ingredients)
# Drop the removed words from the dictionary as well, so the final matrix
# carries no all-zero columns for them.
recipes.dtm.final <- DocumentTermMatrix(
  recipes.corpus,
  control = list(dictionary = setdiff(recipes.dict, removed.ingredients))
)
# Removing words can leave some documents with no terms at all; LDA() needs
# at least one non-zero entry in every row, so drop those documents first.
recipes.dtm.final <- recipes.dtm.final[
  rowSums(as.matrix(recipes.dtm.final)) > 0,
]

# Finally, run the LDA on the matrix that actually reflects the removal
# (not on recipes.dtm.filtered, which still contains the removed words) and
# extract the 5 most characteristic ingredients in each topic... yummy!
# The seed only makes repeated runs return the same topics.
recipes.lda <- LDA(recipes.dtm.final, k = 50, control = list(seed = 1234L))
# NOTE: `t` masks the name of base::t() for non-call uses; the name is kept
# because later code may refer to it.
t <- terms(recipes.lda, 5)
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 | |
[1,] "onion" "pepper" "milk" "tomato" "olive_oil" "milk" "milk" "tomato" "garlic" | |
[2,] "rice" "vinegar" "vanilla" "garlic" "garlic" "nutmeg" "pepper" "cayenne" "cream" | |
[3,] "cayenne" "onion" "cocoa" "oregano" "onion" "vanilla" "yeast" "olive_oil" "vegetable_oil" | |
[4,] "chicken_broth" "tomato" "onion" "onion" "black_pepper" "cinnamon" "potato" "garlic" "pepper" | |
[5,] "olive_oil" "milk" "cane_molasses" "basil" "vinegar" "cream" "lemon_juice" "pepper" "milk" | |
Topic 10 Topic 11 Topic 12 Topic 13 Topic 14 Topic 15 Topic 16 Topic 17 | |
[1,] "milk" "soy_sauce" "vegetable_oil" "onion" "milk" "tamarind" "milk" "vegetable_oil" | |
[2,] "cream" "scallion" "milk" "black_pepper" "cinnamon" "onion" "vanilla" "pepper" | |
[3,] "vanilla" "sesame_oil" "pepper" "vinegar" "onion" "garlic" "cream" "cream" | |
[4,] "cane_molasses" "cane_molasses" "cane_molasses" "bell_pepper" "cayenne" "corn" "vegetable_oil" "black_pepper" | |
[5,] "cinnamon" "roasted_sesame_seed" "cinnamon" "bacon" "olive_oil" "vinegar" "garlic" "mustard" | |
Topic 18 Topic 19 Topic 20 Topic 21 Topic 22 Topic 23 Topic 24 Topic 25 Topic 26 | |
[1,] "cane_molasses" "vanilla" "onion" "garlic" "onion" "vegetable_oil" "onion" "cream" "cumin" | |
[2,] "onion" "cream" "black_pepper" "cane_molasses" "garlic" "soy_sauce" "garlic" "tomato" "coriander" | |
[3,] "vinegar" "almond" "vegetable_oil" "vinegar" "tomato" "sesame_oil" "cane_molasses" "chicken" "turmeric" | |
[4,] "olive_oil" "coconut" "bell_pepper" "black_pepper" "olive_oil" "fish" "tomato" "lemon_juice" "fenugreek" | |
[5,] "pepper" "oat" "garlic" "soy_sauce" "basil" "chicken" "vegetable_oil" "black_pepper" "lemongrass" | |
Topic 27 Topic 28 Topic 29 Topic 30 Topic 31 Topic 32 Topic 33 Topic 34 Topic 35 | |
[1,] "onion" "onion" "onion" "onion" "vanilla" "garlic" "onion" "onion" "garlic" | |
[2,] "garlic" "vinegar" "celery" "pepper" "milk" "onion" "pepper" "garlic" "basil" | |
[3,] "black_pepper" "garlic" "chicken" "garlic" "garlic" "vegetable_oil" "garlic" "vegetable_oil" "pepper" | |
[4,] "tomato" "lemon_juice" "vegetable_oil" "parsley" "cinnamon" "cayenne" "black_pepper" "black_pepper" "tomato" | |
[5,] "olive_oil" "ginger" "carrot" "olive_oil" "cream" "beef" "beef" "chicken" "olive_oil" | |
Topic 36 Topic 37 Topic 38 Topic 39 Topic 40 Topic 41 Topic 42 Topic 43 Topic 44 | |
[1,] "onion" "onion" "onion" "cayenne" "garlic" "vanilla" "vanilla" "scallion" "milk" | |
[2,] "garlic" "garlic" "cream" "garlic" "onion" "cocoa" "cane_molasses" "garlic" "tomato" | |
[3,] "cayenne" "black_pepper" "tomato" "ginger" "bell_pepper" "milk" "cocoa" "ginger" "garlic" | |
[4,] "vegetable_oil" "lemon_juice" "cane_molasses" "rice" "olive_oil" "cinnamon" "oat" "soybean" "vegetable_oil" | |
[5,] "oregano" "scallion" "milk" "onion" "milk" "walnut" "milk" "pepper" "cream" | |
Topic 45 Topic 46 Topic 47 Topic 48 Topic 49 Topic 50 | |
[1,] "onion" "cream" "pepper" "cream" "milk" "olive_oil" | |
[2,] "cream" "black_pepper" "vegetable_oil" "tomato" "vanilla" "tomato" | |
[3,] "black_pepper" "chicken_broth" "garlic" "beef" "lard" "parmesan_cheese" | |
[4,] "milk" "vegetable_oil" "onion" "garlic" "cocoa" "lemon_juice" | |
[5,] "cinnamon" "garlic" "olive_oil" "carrot" "cane_molasses" "garlic" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment