An Analysis of Red Hot Chili Peppers’ Lyrics Using NLP

source
# Collect the album folders in the working directory and sort them by
# release year. Only directories whose name ends in ")" are kept, which
# matches the "<title> (<year>)" naming of the album folders.
albums = sorted(
    [d for d in os.listdir(os.curdir) if os.path.isdir(d) and d.endswith(")")],
    # Sort key: the first run of four digits in the folder name. The raw
    # string keeps "\d" from being read as a string escape sequence.
    key=lambda x: re.search(r"(\d{4})", x).group(0),
)
albums
['The Red Hot Chili Peppers (1984)',
'Freaky Styley (1985)',
'The Uplift Mofo Party Plan (1987)',
"Mother's Milk (1989)",
'Blood Sugar Sex Magik (1991)',
'One Hot Minute (1995)',
'Californication (1999)',
'By The Way (2002)',
'Stadium Arcadium (2006)',
"I'm With You (2011)",
"I'm With You Sessions (2013)",
'The Getaway (2016)']
# Fit a TF-IDF model on tfidf_corpus and transform it into a sparse
# document-term matrix. preprocessor=None and stop_words=None are the
# library defaults, spelled out here explicitly.
tfidf = TfidfVectorizer(preprocessor=None, stop_words=None)
X = tfidf.fit_transform(tfidf_corpus)
# (number of documents, vocabulary size). The shape is read directly from
# the sparse matrix: the `.A` dense-array shortcut is no longer available
# on sparse matrices in recent SciPy releases, and densifying the matrix
# is not needed just to inspect its dimensions.
X.shape
(12, 4007)
# (column index, term) pairs ordered by column index, so that the i-th
# entry names the i-th column of X. Ordering by index (rather than by the
# term string) guarantees the labels line up with the matrix columns.
vocab = sorted((idx, term) for term, idx in tfidf.vocabulary_.items())
# Dense TF-IDF table: one row per entry of `albums`, one column per term.
# toarray() replaces the `.A` shortcut, which recent SciPy releases removed
# from sparse matrices.
vocab_df = pd.DataFrame(
    X.toarray(),
    columns=[term for _, term in vocab],
    index=albums,
)
# For every album, store its five highest-weighted TF-IDF terms in RHCP
# under the columns "tfidf:word #1" ... "tfidf:word #5".
for i, album in enumerate(albums):
    # Row i of vocab_df belongs to albums[i]; nlargest returns the terms
    # (the Series index) ordered from highest to lowest weight.
    top_terms = vocab_df.iloc[i].nlargest(5).index
    for rank, term in enumerate(top_terms, start=1):
        RHCP.loc[album, "tfidf:word #{}".format(rank)] = term
# Display everything from the fourth column onwards.
RHCP.iloc[:, 3:]
from gensim.corpora.dictionary import Dictionary
import gensim
# Flatten the per-album song lists into a single list holding one
# preprocessed document per song.
# NOTE(review): preprocess() is defined elsewhere; it is assumed to return
# a list of tokens, which is what Phrases and Dictionary expect -- confirm.
lda_corpus = []
for album in lyrics_per_album:
    lda_corpus.extend(preprocess(song) for song in album)

# Learn frequently co-occurring token pairs, then run the same detection
# on the bigram-merged corpus so longer phrases can be merged as well.
bigram = gensim.models.Phrases(lda_corpus, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[lda_corpus], min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Rewrite every document with the detected phrases merged into one token.
lda_corpus = [trigram_mod[bigram_mod[doc]] for doc in lda_corpus]

# Map tokens to integer ids and convert each document to bag-of-words form.
dictionary = Dictionary(lda_corpus)
bow_corpus = [dictionary.doc2bow(text) for text in lda_corpus]
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel
# Baseline LDA model with 12 topics; random_state is fixed so repeated
# runs use the same seed.
model = LdaMulticore(
    bow_corpus,
    num_topics=12,
    id2word=dictionary,
    workers=7,
    passes=10,
    per_word_topics=True,
    random_state=1,
)
# Score the model with the c_v coherence measure, computed from the
# tokenised texts.
coherence_model = CoherenceModel(
    model=model,
    corpus=bow_corpus,
    texts=lda_corpus,
    coherence="c_v",
)
print(coherence_model.get_coherence())
0.33568880799556466
from pprint import pprint

# Pretty-print the highest-weighted words of each topic in `model`.
pprint(model.print_topics())
# The printed result of the call above follows, kept verbatim.
[(0,
'0.020*"get" + 0.017*"love" + 0.016*"like" + 0.012*"take" + 0.011*"come" + '
'0.010*"could" + 0.009*"another" + 0.008*"make" + 0.007*"back" + 0.007*"go"'),
(1,
'0.022*"say" + 0.020*"know" + 0.018*"get" + 0.017*"go" + 0.016*"yeah" + '
'0.015*"come" + 0.014*"hey" + 0.013*"want" + 0.011*"oh" + 0.011*"like"'),
(2,
'0.046*"love" + 0.014*"make" + 0.014*"see" + 0.012*"around" + 0.010*"get" + '
'0.010*"want" + 0.009*"know" + 0.009*"way" + 0.008*"never" + 0.008*"go"'),
(3,
'0.022*"get" + 0.017*"know" + 0.014*"come" + 0.013*"want" + 0.013*"girl" + '
'0.012*"go" + 0.011*"like" + 0.009*"friend" + 0.008*"call" + 0.008*"walk"'),
(4,
'0.054*"get" + 0.013*"know" + 0.011*"give" + 0.010*"say" + 0.010*"tell" + '
'0.009*"right" + 0.009*"one" + 0.009*"love" + 0.009*"come" + 0.009*"make"'),
(5,
'0.029*"around" + 0.023*"look" + 0.018*"please" + 0.011*"know" + 0.010*"get" '
'+ 0.009*"way" + 0.009*"never" + 0.009*"torture" + 0.009*"far" + '
'0.008*"like"'),
(6,
'0.026*"get" + 0.020*"turn" + 0.014*"well" + 0.012*"yeah" + 0.009*"yes" + '
'0.009*"sale" + 0.008*"like" + 0.008*"blood" + 0.008*"true_men_kill_coyote" '
'+ 0.008*"know"'),
(7,
'0.014*"love" + 0.013*"time" + 0.012*"like" + 0.012*"oh" + 0.012*"get" + '
'0.011*"know" + 0.011*"come" + 0.010*"go" + 0.009*"sing" + 0.008*"want"'),
(8,
'0.020*"know" + 0.017*"get" + 0.015*"let" + 0.011*"time" + 0.010*"go" + '
'0.009*"man" + 0.009*"like" + 0.008*"say" + 0.008*"want" + 0.008*"girl"'),
(9,
'0.020*"get" + 0.015*"long" + 0.014*"make" + 0.013*"come" + 0.012*"gon_na" + '
'0.012*"time" + 0.009*"fall" + 0.009*"baby" + 0.009*"know" + 0.008*"wo"'),
(10,
'0.028*"want" + 0.024*"party_pussy" + 0.021*"yeah" + 0.020*"baby" + '
'0.019*"good" + 0.013*"get" + 0.011*"say" + 0.008*"make" + 0.008*"take" + '
'0.007*"god"'),
(11,
'0.029*"away" + 0.018*"take" + 0.017*"like" + 0.016*"get" + 0.015*"know" + '
'0.012*"make" + 0.011*"never" + 0.009*"thing" + 0.008*"see" + 0.008*"say"')]
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
# Hyper-parameter search space for the LDA model. The dimension names
# match the keyword arguments that `objective` reads.
space = [
    Integer(3, 20, name="num_topics"),
    Real(0.01, 1, name="alpha"),
    Real(0.01, 1, name="eta"),
]
@use_named_args(space)
def objective(**params):
    """Train an LDA model for one hyper-parameter setting and score it.

    Args:
        **params: Values for the dimensions declared in ``space``
            (``num_topics``, ``alpha`` and ``eta``), passed by name through
            the ``use_named_args`` decorator.

    Returns:
        The negated c_v coherence of the trained model. The value is
        negated because the optimiser minimises the objective, while a
        higher coherence is better.
    """
    model = LdaMulticore(
        bow_corpus,
        id2word=dictionary,
        workers=7,
        passes=30,
        per_word_topics=True,
        random_state=1,
        # Cast to plain Python numbers in case the optimiser hands over
        # NumPy scalar types.
        num_topics=int(params["num_topics"]),
        alpha=float(params["alpha"]),
        eta=float(params["eta"]),
    )
    coherence_model = CoherenceModel(
        model=model,
        corpus=bow_corpus,
        texts=lda_corpus,
        coherence="c_v",
    )
    return -coherence_model.get_coherence()
# Minimise `objective` over `space` with 100 evaluations.
res_gp = gp_minimize(objective, space, n_calls=100, n_jobs=-1, verbose=True)
# NOTE: res_gp.fun is the minimised objective value, i.e. the negated
# coherence returned by `objective`, so the number printed here is negative.
print("Best coherence: {:.2f}".format(res_gp.fun))
# res_gp.x lists the best values in the order of `space`:
# num_topics, alpha, eta.
print("Best parameters: {}, {:.2f}, {:.2f}".format(res_gp.x[0], res_gp.x[1], res_gp.x[2]))
Best coherence: -0.58
Best parameters: 16, 0.85, 1.00
(0,'0.016*"know" + 0.016*"get" + 0.015*"love" + 0.014*"like" + 0.011*"make" + '
'0.011*"want" + 0.010*"say" + 0.010*"come" + 0.009*"go" + 0.009*"take"')
(4,'0.032*"get" + 0.009*"time" + 0.008*"love" + 0.007*"come" + 0.007*"give" + '
'0.006*"know" + 0.006*"good" + 0.006*"tell" + 0.006*"say" + 0.005*"right"')
(11,
'0.003*"king" + 0.003*"minor" + 0.003*"repeat" + 0.002*"backwoods" + '
'0.002*"yertle_turtle" + 0.002*"otherside" + 0.002*"throw" + '
'0.002*"television" + 0.002*"turtle" + '
'0.002*"dream_californication_dream_californication"')
(13,
'0.004*"blood" + 0.004*"well" + 0.003*"true_men_kill_coyote" + 0.003*"yeah" + '
'0.002*"feel" + 0.002*"true_men" + 0.002*"gon_na" + 0.002*"hollywood_hill" + '
'0.002*"dig_dirt_dig_dust" + 0.002*"paisley_dragon_hollywood_hill"')
(2,
'0.002*"stretch" + 0.001*"love" + 0.001*"dirty" + 0.001*"stain" + '
'0.001*"bird" + 0.001*"thirty" + 0.001*"earthworm" + 0.001*"burping" + '
'0.001*"chirp" + 0.001*"curb"')
# Work on a copy of the scatter data and give three rows (selected by
# index label) a title to annotate; every other row keeps the placeholder
# string "None".
scat_new = scat.copy()
scat_new.loc[:, "Label"] = "None"
highlighted = {
    89: "Californication",
    109: "By The Way",
    131: "Dani California",
}
for row_label, title in highlighted.items():
    scat_new.loc[row_label, "Label"] = title

fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above; legend=False is the documented way to
# suppress the legend.
ax = sns.scatterplot(
    data=scat_new,
    x="x",
    y="y",
    hue="category",
    legend=False,
    s=100,
    ax=ax,
)
for _, point in scat_new.iterrows():
    if point["Label"] != "None":
        # Nudge the text slightly to the right so it does not sit on top
        # of its marker.
        ax.text(point["x"] + .02, point["y"], str(point["Label"]))
ax.set(xlabel="", ylabel="")
plt.show()
plt.clf()

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store