Early detection of emerging food trends can translate into great business opportunities. Today, a lot of food-related discussions occur on social media platforms such as Twitter and Facebook. Thus, such social media content presents a potentially valuable, real-time source of intelligence that retailers can leverage to better serve their customers.
The purpose of this project is to help retailers see the rise and fall of certain categories of food before competitors do.
LDA is one plausible way to discover a particular 'topic' (here, ingredients) in the Facebook data through topic modeling. By ranking how frequently ingredients appear together, we can tell what kind of food is popular during a certain period of time.
However, LDA groups many popular words into the same topic at once, so it is hard to detect the exact relationship between any two ingredients, which may make the food trend ambiguous. This notebook therefore measures pairwise co-occurrence of ingredients directly.
# import useful packages
import os
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
# ignore some useless warnings
import warnings
def function():
    """Emit a sample ``DeprecationWarning`` to exercise the warning filter."""
    warnings.warn("deprecated", DeprecationWarning)


# A filter installed inside ``warnings.catch_warnings()`` is discarded when the
# ``with`` block exits, so it cannot silence warnings raised by later code.
# Install the "ignore" filter at module level so it stays active.
warnings.simplefilter("ignore")
function()
# definitions of cooccurrence matrix
def cooccurrence_next_word(sentlist, targets):
    """Count how often one target word is immediately followed by another.

    Args:
        sentlist: Iterable of sentences; each one is split on whitespace.
        targets: Sequence of target words. A word's position in this
            sequence is its row/column index in the result.

    Returns:
        A float64 array of shape ``(len(targets), len(targets))`` whose entry
        ``[a, b]`` is the number of times ``targets[a]`` is directly followed
        by ``targets[b]``. The diagonal is zeroed and the matrix is not
        symmetrized.
    """
    cooc = np.zeros((len(targets), len(targets)), np.float64)
    w2i = {w: i for i, w in enumerate(targets)}
    for sent in sentlist:
        # Translate each token to its target index once (None = not a target).
        ids = [w2i.get(w) for w in sent.split()]
        for left, right in zip(ids, ids[1:]):
            # Index 0 is a valid target, so compare against None explicitly.
            if left is not None and right is not None:
                cooc[left, right] += 1
    np.fill_diagonal(cooc, 0)
    return cooc
def cooccurrence_same_sentence(sentlist, targets):
    """Count how often two target words occur in the same sentence.

    Args:
        sentlist: Iterable of sentences; each one is split on whitespace.
        targets: Sequence of target words. A word's position in this
            sequence is its row/column index in the result.

    Returns:
        A symmetric float64 array of shape ``(len(targets), len(targets))``
        with a zero diagonal. Every pair of target occurrences within one
        sentence adds 1 to both ``[a, b]`` and ``[b, a]``.
    """
    cooc = np.zeros((len(targets), len(targets)), np.float64)
    w2i = {w: i for i, w in enumerate(targets)}
    for sent in sentlist:
        # Only target words can contribute, so drop everything else before
        # enumerating pairs; this keeps the quadratic step small.
        ids = [w2i[w] for w in sent.split() if w in w2i]
        for pos, first in enumerate(ids):
            for second in ids[pos + 1:]:
                cooc[first, second] += 1
    np.fill_diagonal(cooc, 0)
    # Each pair was counted once (earlier word -> later word); mirror it.
    return cooc + cooc.T
def cooccurrence_symmetric_window(sentlist, targets, weights):
    """Accumulate distance-weighted co-occurrence of target words.

    Args:
        sentlist: Iterable of sentences; each one is split on whitespace.
        targets: Sequence of target words. A word's position in this
            sequence is its row/column index in the result.
        weights: Sequence of weights; ``weights[d - 1]`` is added for two
            target words that are ``d`` positions apart. Its length is the
            window size, so words further apart are ignored.

    Returns:
        A symmetric float64 array of shape ``(len(targets), len(targets))``
        with a zero diagonal.
    """
    m = len(weights)
    cooc = np.zeros((len(targets), len(targets)), np.float64)
    w2i = {w: i for i, w in enumerate(targets)}
    for sent in sentlist:
        # Keep non-targets as None so token distances stay intact.
        ids = [w2i.get(w) for w in sent.split()]
        for i, focal in enumerate(ids):
            if focal is None:
                # Nothing in this window can match; skip the inner scan.
                continue
            for offset, other in enumerate(ids[i + 1:i + 1 + m]):
                if other is not None:
                    cooc[focal, other] += weights[offset]
    np.fill_diagonal(cooc, 0)
    # Only the forward half of each window was scanned; mirror it.
    return cooc + cooc.T
# convert list to string
def listToString(s):
    """Concatenate the strings in ``s`` into one string, with no separator.

    Args:
        s: Iterable of strings.

    Returns:
        The concatenation of all elements; ``""`` when ``s`` is empty.
    """
    # str.join builds the result in one pass instead of repeated ``+=``.
    return "".join(s)
# load facebook data
import csv
# One list of raw lines per monthly export: 2011-2015, twelve files per year,
# appended in chronological order (data[0] is 2011-1, data[59] is 2015-12).
data = []
for year in range(2011, 2016):
    for month in range(1, 13):
        path = '/Users/apple/Downloads/fb{0}/fpost-{0}-{1}.csv'.format(year, month)
        # Close each file as soon as it is read rather than keeping all 60
        # handles open for the lifetime of the session.
        with open(path, 'r') as posts:
            data.append(posts.readlines())
# load ingredient data
# Read the ingredient list; the file is closed as soon as it has been read.
with open("/Users/apple/Desktop/ingredients.txt", "r") as ingredient_file:
    ingredients = ingredient_file.readlines()
# transform ingredient data by using word tokenization
ingredients = listToString([x.lower() for x in ingredients])
# Tokens that occur in the ingredient list but are excluded from the vocabulary.
# Filtering with a set does not fail when one of them is absent.
non_ingredients = {"and", "of", "con", "de", "sel"}
# Sort the unique tokens so each ingredient gets the same matrix index on every
# run; the iteration order of a set of strings can differ between runs.
newing = sorted(set(nltk.word_tokenize(ingredients)) - non_ingredients)
# get the cooccurence matrix
def getCooc(data, targets=None, weights=None):
    """Build the weighted co-occurrence matrix for one batch of post lines.

    The lines are lower-cased and concatenated, split into sentences with
    ``sent_tokenize``, stripped of every non-letter character, and passed to
    ``cooccurrence_symmetric_window``.

    Args:
        data: Iterable of text lines.
        targets: Vocabulary to count. Defaults to the module-level ``newing``.
        weights: Window weights by token distance. Defaults to
            ``1/np.arange(1,10)``, i.e. a 9-word window weighted 1, 1/2, ...,
            1/9.

    Returns:
        The symmetric matrix returned by ``cooccurrence_symmetric_window``.
    """
    if targets is None:
        targets = newing
    if weights is None:
        weights = 1/np.arange(1,10)
    symbols = listToString([x.lower() for x in data])
    # Replace every non-letter with a space so tokens split cleanly.
    sentencelist = [re.sub("[^a-zA-Z]", " ", x) for x in sent_tokenize(symbols)]
    return cooccurrence_symmetric_window(sentencelist, targets, weights)
# show the most popular pair of ingredients in each month
def getTrendList(data, min_trend=10):
    """List the ingredient pairs with the strongest co-occurrence in ``data``.

    Args:
        data: Iterable of text lines, passed on to ``getCooc``.
        min_trend: Minimum co-occurrence score for a pair to be listed.

    Returns:
        A DataFrame with columns ``trend`` (the score) and ``food`` (the two
        ingredient names joined by ``' ,'``), sorted by ``trend`` in
        descending order. Each unordered pair appears exactly once.
    """
    cooc = getCooc(data)
    # The matrix is symmetric, so every pair is stored twice. Keeping only the
    # strict upper triangle de-duplicates pairs without depending on how the
    # sort happens to order rows with equal scores.
    strong = np.triu(cooc >= min_trend, k=1)
    rows, cols = np.nonzero(strong)
    month = pd.DataFrame(
        {
            'trend': cooc[rows, cols],
            'food': [newing[r] + ' ,' + newing[c] for r, c in zip(rows, cols)],
        },
        columns=['trend', 'food'],
    )
    # mergesort is stable, so tied pairs keep a reproducible order.
    return month.sort_values(by='trend', ascending=False, kind='mergesort')
# One co-occurrence matrix for each of the first 60 entries of ``data``,
# stacked along a new leading axis.
cooc = [getCooc(data[i]) for i in tqdm(range(60))]
foodlist = np.stack(cooc)
# save the data
from tempfile import TemporaryFile
outfile = TemporaryFile()
x = foodlist
np.save(outfile, x)
# Map every ingredient to its row/column position in the matrices.
foodindex = dict(zip(newing, range(len(newing))))
# plot the trend
def plotTrend(f1,f2):
    """Plot the co-occurrence score of two ingredients across ``foodlist``.

    Args:
        f1: First ingredient; must be a key of ``foodindex``.
        f2: Second ingredient; must be a key of ``foodindex``.
    """
    fig, ax = plt.subplots(figsize=(15, 5))

    # Slice out the (f1, f2) entry from every stacked matrix.
    series = foodlist[:, foodindex[f1], foodindex[f2]]
    ax.plot(series)

    # One tick every 12 positions, labelled with the January of each year.
    tick_positions = np.arange(60, step=12)
    tick_labels = ['2011-01', '2012-01', '2013-01', '2014-01', '2015-01']
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, fontsize=18)

    heading = 'The Trend of ' + f1 + ' and ' + f2
    ax.set_title(heading, fontsize=30)
    ax.set_xlabel("Month", fontsize=20)
    ax.set_ylabel("Number", fontsize=20)
By using the getTrendList function I created, we can take a look at the most popular food combinations in each month. The following shows the situation in November of 2011.
To celebrate Thanksgiving, I expected that people would mention some traditional Thanksgiving food in November. The trend list confirms this expectation: pumpkin pie and cranberry sauce are the most obvious examples.
# find out what kind of food combinations are popular in each month
getTrendList(data[10]).head(20)
# Draw one trend chart per ingredient pair, in the order listed.
for first, second in [
    ('pumpkin', 'pie'),
    ('cranberry', 'sauce'),
    ('squash', 'butternut'),
    ('turkey', 'breast'),
    ('vegetable', 'noodle'),
    ('cauliflower', 'rice'),
    ('chicken', 'soup'),
    ('beef', 'taco'),
    ('chocolate', 'cake'),
    ('ginger', 'garlic'),
    ('coconut', 'oil'),
]:
    plotTrend(first, second)