Error when generating Lyrics for our model - python

Hello! I am doing a group project with a friend and we're trying to generate rap lyrics. However, when we run the code we get the error shown below and we don't know how to fix it. If you can help us, that'd be amazing :). This is a draft of our code and we are currently trying to solve this issue. Bars2.txt is a small text file with just two songs, since we want to debug the code first. We've been trying to fix this for a few days but it is still not working.
This is the code:
with open('Bars2.txt', encoding="utf8") as f:
    dirty_rap_source = f.read()

rap_source = [x.split("\r")[0] for x in dirty_rap_source.split("\n")]

while "" in rap_source:
    rap_source.remove("")
while " " in rap_source:
    rap_source.remove(" ")

print("Your dataset's first 3 rap lines: ")
print(rap_source[:3])
!pip install markovify
!pip install pronouncing
# Imports
import markovify
import re
import pronouncing
import random
import numpy as np
import os
from keras.models import Sequential
from keras.layers import LSTM
# These are just the parameters of the network.
depth = 4 # depth of the network. changing will require a retrain
maxsyllables = 16 # maximum syllables per line. Change this freely without retraining the network
rap_length = 5 # number of lines in the rap song
epochs_to_train = 10 # how many times the network trains on the whole dataset
def create_network(depth):
    # Sequential() creates a linear stack of layers
    model = Sequential()
    # adds an LSTM layer as the first layer in the network with
    # 4 units (nodes) and a (2, 2) input shape, which is the same
    # shape as each training sample
    model.add(LSTM(4, input_shape=(2, 2), return_sequences=True))
    # adds 'depth' more layers to the network with 8 nodes each
    for i in range(depth):
        model.add(LSTM(8, return_sequences=True))
    # adds a final layer with 2 nodes for the output
    model.add(LSTM(2, return_sequences=True))
    # prints a summary representation of the model
    model.summary()
    # configures the learning process for the network / model:
    # the rmsprop optimizer drives the gradient descent, and the
    # 'mse' loss uses the mean squared error when training
    model.compile(optimizer='rmsprop',
                  loss='mse')
    #if artist + ".rap" in os.listdir(".") and train_mode == False:
    #    # loads the weights from the hdf5 file saved earlier
    #    model.load_weights(str(artist + ".rap"))
    #    print("loading saved network: " + str(artist) + ".rap")
    return model
def markov(text_file):
    # note: this currently ignores the text_file argument and always
    # builds the model from the global rap_source list
    read = rap_source
    # markovify goes line by line through the lyrics and
    # creates a model of the text which allows us to use
    # make_sentence() later on to create a bar for the lyrics.
    # It builds a probability distribution over all the words
    # so it can generate words based on the current word we're on.
    text_model = markovify.NewlineText(read)
    return text_model
# used when generating bars to make sure a line is not longer
# than the max syllables; bars keep being generated until
# the syllable count is below the max
def syllables(line):
    count = 0
    for word in line.split(" "):
        vowels = 'aeiouy'
        word = word.lower().strip(".:;?!")
        #if not word:
        #    continue
        if word[0] in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith('e'):
            count -= 1
        if word.endswith('le'):
            count += 1
        if count == 0:
            count += 1
    return count / maxsyllables
# writes a rhyme list to a rhymes file that allows for use when
# building the dataset and composing the rap
'''def rhymeindex(lyrics):
    #if str(artist) + ".rhymes" in os.listdir(".") and train_mode == False:
    #    print "loading saved rhymes from " + str(artist) + ".rhymes"
    #    return open(str(artist) + ".rhymes", "r").read().split("\n")
    if True:
        rhyme_master_list = []
        print("Alright, building the list of all the rhymes")
        for i in lyrics:
            # grabs the last word in each bar
            word = re.sub(r"\W+", '', i.split(" ")[-1]).lower()
            #print(f'Printing word: {word}')
            # pronouncing.rhymes gives us the words that rhyme with the word being passed in
            rhymeslist = pronouncing.rhymes(word)
            # need to convert the unicode rhyme words to UTF8
            rhymeslist = [x.encode('UTF8') for x in rhymeslist]
            # rhymeslistends contains the last two characters of each word
            # that could potentially rhyme with our word
            rhymeslistends = []
            for i in rhymeslist:
                rhymeslistends.append(i[-2:])
            try:
                # rhymescheme gets all the unique two-letter endings and then
                # finds the one that occurs the most
                rhymescheme = max(set(rhymeslistends), key=rhymeslistends.count)
            except Exception:
                rhymescheme = word[-2:]
            rhyme_master_list.append(rhymescheme)
        # rhyme_master_list is a list of the two-letter endings that appear
        # the most in the rhyme list for each word
        rhyme_master_list = list(set(rhyme_master_list))
        reverselist = [x[::-1] for x in rhyme_master_list]
        reverselist = sorted(reverselist)
        print(f'Sorted reverselist: {reverselist}')
        # rhymelist is a list of the two-letter endings (reversed).
        # The endings are reversed and sorted so that if the network
        # messes up a little bit and doesn't return quite the right
        # values, it often picks the rhyme ending next to the expected
        # one in the list. With the endings sorted by their final letter,
        # similar endings sit close together, so a near miss still lands
        # on a similar rhyme.
        rhymelist = [x[::-1] for x in reverselist]
        #f = open(str(artist) + ".rhymes", "w")
        #f.write("\n".join(rhymelist))
        #f.close()
        print(rhymelist)
        return rhymelist'''
def rhymeindex(lyrics):
    if str('bobo') + ".rhymes" in os.listdir("."):
        print("loading saved rhymes from " + str('bobo') + ".rhymes")
        return open(str('bobo') + ".rhymes", "r", encoding='utf-8').read().split("\n")
    else:
        rhyme_master_list = []
        print("Building list of rhymes:")
        for i in lyrics:
            word = re.sub(r"\W+", '', i.split(" ")[-1]).lower()
            rhymeslist = pronouncing.rhymes(word)
            rhymeslistends = []
            for i in rhymeslist:
                rhymeslistends.append(i[-2:])
            try:
                rhymescheme = max(set(rhymeslistends), key=rhymeslistends.count)
            except Exception:
                rhymescheme = word[-2:]
            rhyme_master_list.append(rhymescheme)
        rhyme_master_list = list(set(rhyme_master_list))
        reverselist = [x[::-1] for x in rhyme_master_list]
        reverselist = sorted(reverselist)
        rhymelist = [x[::-1] for x in reverselist]
        print("List of Sorted 2-Letter Rhyme Ends:")
        print(rhymelist)
        f = open(str('bobo') + ".rhymes", "w", encoding='utf-8')
        f.write("\n".join(rhymelist))
        f.close()
        return rhymelist
# converts the index of the most common rhyme ending
# into a float
def rhyme(line, rhyme_list):
    word = re.sub(r"\W+", '', line.split(" ")[-1]).lower()
    rhymeslist = pronouncing.rhymes(word)
    rhymeslist = [x.encode('UTF8') for x in rhymeslist]
    rhymeslistends = []
    for i in rhymeslist:
        rhymeslistends.append(i[-2:])
    try:
        rhymescheme = max(set(rhymeslistends), key=rhymeslistends.count)
    except Exception:
        rhymescheme = word[-2:]
    try:
        float_rhyme = rhyme_list.index(rhymescheme)
        float_rhyme = float_rhyme / float(len(rhyme_list))
        return float_rhyme
    except Exception:
        return None
# grabs each line of the lyrics file, puts each one in its own
# index of a list, removes any empty lines, and returns the
# list as bars
def split_lyrics_file(text):
    #text = open(text_file).read()
    #text = text.split("\n")
    while "" in text:
        text.remove("")
    return text

# only run when not training
def generate_lyrics(lyrics_file):
    bars = []
    last_words = []
    lyriclength = len(lyrics_file)
    count = 0
    markov_model = markov((". ").join(lyrics_file) + ".")
    while len(bars) < lyriclength / 9 and count < lyriclength * 2:
        # By default, the make_sentence method tries, a maximum of 10 times per invocation,
        # to make a sentence that doesn't overlap too much with the original text.
        # If it is successful, the method returns the sentence as a string.
        # If not, it returns None. (https://github.com/jsvine/markovify)
        bar = markov_model.make_sentence()
        # make sure the bar isn't None and that the number of
        # syllables is under the max syllables
        if type(bar) != type(None) and syllables(bar) < 1:
            # function to get the last word of the bar
            def get_last_word(bar):
                last_word = bar.split(" ")[-1]
                # if the last character is punctuation, drop it
                if last_word[-1] in "!.?,":
                    last_word = last_word[:-1]
                return last_word
            last_word = get_last_word(bar)
            # only use the bar if it is unique and the last_word
            # has been seen fewer than 3 times
            if bar not in bars and last_words.count(last_word) < 3:
                bars.append(bar)
                last_words.append(last_word)
                count += 1
    return bars
# used to construct the 2x2 inputs for the LSTMs
# the lyrics being passed in are the original lyrics if training,
# or our generated ones if the model is already trained
def build_dataset(lyrics, rhyme_list):
    dataset = []
    line_list = []
    # line_list holds, for each line: the line itself, the syllable value
    # (a float between 0 and 1, since syllables() divides the count by
    # maxsyllables), and the rhyme value (the normalized index of the most
    # common ending among words that rhyme with the line's last word)
    for line in lyrics:
        line_list = [line, syllables(line), rhyme(line, rhyme_list)]
        dataset.append(line_list)
    x_data = []
    y_data = []
    # using range(len(dataset) - 3) because of the way the indices are
    # accessed to get the lines
    for i in range(len(dataset) - 3):
        line1 = dataset[i][1:]
        line2 = dataset[i + 1][1:]
        line3 = dataset[i + 2][1:]
        line4 = dataset[i + 3][1:]
        # populate the training data
        # grabs the syllables and rhyme index here
        x = [line1[0], line1[1], line2[0], line2[1]]
        x = np.array(x)
        # the data is shaped as a 2x2 array where each row is a
        # [syllable, rhyme_index] pair
        x = x.reshape(2, 2)
        # populate the target data
        y = [line3[0], line3[1], line4[0], line4[1]]
        y = np.array(y)
        y = y.reshape(2, 2)
        if type(x) is np.ndarray and type(y) is np.ndarray:
            x_data.append(x)
            y_data.append(y)
        else:
            continue
    # returns the 2x2 arrays as datasets
    x_data = np.array(x_data)
    y_data = np.array(y_data)
    # print("x shape " + str(x_data.shape))
    # print("y shape " + str(y_data.shape))
    return x_data, y_data
# only used when not training
def compose_rap(lines, rhyme_list, lyrics_file, model):
    rap_vectors = []
    human_lyrics = split_lyrics_file(lyrics_file)
    # choose a random line of the given lyrics to start from
    initial_index = random.choice(range(len(human_lyrics) - 1))
    # create an initial_lines list consisting of 8 lines
    initial_lines = human_lyrics[initial_index:initial_index + 8]
    starting_input = []
    for line in initial_lines:
        # appends a [syllable, rhyme_index] pair to starting_input
        starting_input.append([syllables(line), rhyme(line, rhyme_list)])
        print(f'Printing starting_input: {starting_input}')
    # predict generates output predictions for the given samples;
    # the input is reshaped to (4, 2, 2) so the model predicts on each
    # 2x2 matrix of [syllable, rhyme_index] pairs
    starting_vectors = model.predict(np.array([starting_input]).flatten().reshape(4, 2, 2).astype('float32'))
    rap_vectors.append(starting_vectors)
    print(f'Printing starting_vectors: {starting_vectors}')
    for i in range(rap_length):
        rap_vectors.append(model.predict(np.array([rap_vectors[-1]]).flatten().reshape(4, 2, 2)))
        print(f'Printing rap_vectors: {rap_vectors}')
    return rap_vectors
def vectors_into_song(vectors, generated_lyrics, rhyme_list):
    print("\n\n")
    print("About to write rap (this could take a moment)...")
    print("\n\n")

    # compare the last words to see if they are the same; if they are,
    # increment a penalty variable which grants penalty points for
    # being uncreative
    def last_word_compare(rap, line2):
        penalty = 0
        for line1 in rap:
            word1 = line1.split(" ")[-1]
            word2 = line2.split(" ")[-1]
            # remove any punctuation from the words
            #while word1[-1] in "?!,.":
            #    word1 = word1[:-1]
            #while word2[-1] in "?!,.":
            #    word2 = word2[:-1]
            if word1 == word2:
                penalty += 0.2
        return penalty

    # vector_half is a single [syllable, rhyme_index] pair;
    # returns a score rating for a given line
    def calculate_score(vector_half, syllables, rhyme, penalty):
        print(f'vector_half: {vector_half}, syllables: {syllables}, rhyme: {rhyme}, penalty: {penalty}')
        desired_syllables = vector_half[0]
        desired_rhyme = vector_half[1]
        # desired_syllables is the number of syllables we want
        desired_syllables = desired_syllables * maxsyllables
        # desired_rhyme is the index of the rhyme we want
        desired_rhyme = desired_rhyme * len(rhyme_list)
        # generate a score: start from 1, subtract the difference between
        # predicted and generated syllables plus the difference between
        # predicted and generated rhyme, then subtract the penalty
        score = 1.0 - (abs((float(desired_syllables) - float(syllables))) + abs(
            (float(desired_rhyme) - float(rhyme)))) - penalty
        return score

    # generates a list of all the lines from generated_lyrics, each with
    # its line, syllable value, and rhyme float value
    dataset = []
    for line in generated_lyrics:
        line_list = [line, syllables(line), rhyme(line, rhyme_list)]
        dataset.append(line_list)
    rap = []
    vector_halves = []
    for vector in vectors:
        # vectors are the 2x2 rap_vectors (predicted bars) generated by compose_rap();
        # separate every vector into halves (essentially one bar each) where each
        # half is a [syllables, rhyme_index] pair
        vector_halves.append(list(vector[0][0]))
        vector_halves.append(list(vector[0][1]))
    for vector in vector_halves:
        # Each vector (predicted bar) is scored against every generated bar ('item' below)
        # to find the generated bar that best matches (highest score) the vector predicted
        # by the model. This bar is then added to the final rap and also removed from the
        # generated lyrics (dataset) so that we don't get duplicate lines in the final rap.
        scorelist = []
        for item in dataset:
            # item is one of the generated bars from the Markov model
            line = item[0]
            if len(rap) != 0:
                penalty = last_word_compare(rap, line)
            else:
                penalty = 0
            # calculate the score of the current line
            total_score = calculate_score(vector, item[1], item[2], penalty)
            score_entry = [line, total_score]
            # add the score of the current line to a scorelist
            scorelist.append(score_entry)
        fixed_score_list = []
        for score in scorelist:
            fixed_score_list.append(float(score[1]))
        # get the line with the max valued score from the fixed_score_list
        max_score = max(fixed_score_list)
        for item in scorelist:
            if item[1] == max_score:
                # append item[0] (the line) to the rap
                rap.append(item[0])
                print(str(item[0]))
                # remove the line we added to the rap so
                # it doesn't get chosen again
                for i in dataset:
                    if item[0] == i[0]:
                        dataset.remove(i)
                        break
                break
    return rap
def train(x_data, y_data, model):
    # fit is used to train the model for epochs_to_train iterations, where
    # x_data is the training data and y_data is the target data;
    # batch_size is a subset of the training data (4 in this case)
    # and verbose simply shows a progress bar
    model.fit(np.array(x_data), np.array(y_data),
              batch_size=4,
              epochs=epochs_to_train,
              verbose=1)
    # save_weights saves the best weights from training to a hdf5 file
    #model.save_weights(artist + ".rap")

def main(depth):
    train_mode = True
    model = create_network(depth)
    # change the lyrics file to the file with the lyrics you want to train on
    text_file = rap_source
    bars = split_lyrics_file(text_file)
    rhyme_list = rhymeindex(bars)
    print(f'rhyme_list: {rhyme_list}')
    x_data, y_data = build_dataset(bars, rhyme_list)
    x_data = np.asarray(x_data).astype('float32')
    y_data = np.asarray(y_data).astype('float32')
    train(x_data, y_data, model)
    bars = generate_lyrics(text_file)
    vectors = compose_rap(bars, rhyme_list, text_file, model)
    print(f'Printing vectors: {vectors}')
    rap = vectors_into_song(vectors, bars, rhyme_list)
    for bar in rap:
        print(bar)
This is the error:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 lstm (LSTM)                 (None, 2, 4)              112
 lstm_1 (LSTM)               (None, 2, 8)              416
 lstm_2 (LSTM)               (None, 2, 8)              544
 lstm_3 (LSTM)               (None, 2, 8)              544
 lstm_4 (LSTM)               (None, 2, 8)              544
 lstm_5 (LSTM)               (None, 2, 2)              88
=================================================================
Total params: 2,248
Trainable params: 2,248
Non-trainable params: 0
_________________________________________________________________
Building list of rhymes:
List of Sorted 2-Letter Rhyme Ends:
['m3', 'ca', 'ia', 'la', 'pa', 'ra', 'ta', 'ib', 'ob', 'ed', 'nd', 'rd', 'ud', 'ce', 'de', 'ee', 'ge', 'ke', 'le', 'me', 'ne', 're', 'se', 'te', 'ze', 'lf', 'ag', 'ng', 'og', 'ah', 'ch', 'gh', 'oh', 'sh', 'th', 'uh', 'ai', 'li', 'ni', 'si', 'zi', 'ck', 'rk', 'al', 'el', 'll', 'rl', 'am', 'im', 'um', 'an', 'en', 'in', 'on', 'un', 'wn', 'ao', 'co', 'jo', 'ko', 'mo', 'so', 'ap', 'ip', 'pp', 'ar', 'er', 'ur', "'s", '0s', 'as', 'cs', 'ds', 'es', 'gs', 'ks', 'ls', 'ms', 'ns', 'ps', 'rs', 'ss', 'ts', 'us', "'t", 'at', 'ct', 'et', 'it', 'nt', 'rt', 'tt', 'ut', 'au', 'ew', 'ow', 'ay', 'by', 'ly', 'ny', 'py', 'ty', 'iz', 'tz']
rhyme_list: ['m3', 'ca', 'ia', 'la', 'pa', 'ra', 'ta', 'ib', 'ob', 'ed', 'nd', 'rd', 'ud', 'ce', 'de', 'ee', 'ge', 'ke', 'le', 'me', 'ne', 're', 'se', 'te', 'ze', 'lf', 'ag', 'ng', 'og', 'ah', 'ch', 'gh', 'oh', 'sh', 'th', 'uh', 'ai', 'li', 'ni', 'si', 'zi', 'ck', 'rk', 'al', 'el', 'll', 'rl', 'am', 'im', 'um', 'an', 'en', 'in', 'on', 'un', 'wn', 'ao', 'co', 'jo', 'ko', 'mo', 'so', 'ap', 'ip', 'pp', 'ar', 'er', 'ur', "'s", '0s', 'as', 'cs', 'ds', 'es', 'gs', 'ks', 'ls', 'ms', 'ns', 'ps', 'rs', 'ss', 'ts', 'us', "'t", 'at', 'ct', 'et', 'it', 'nt', 'rt', 'tt', 'ut', 'au', 'ew', 'ow', 'ay', 'by', 'ly', 'ny', 'py', 'ty', 'iz', 'tz']
Epoch 1/10
96/96 [==============================] - 14s 11ms/step - loss: nan
Epoch 2/10
96/96 [==============================] - 1s 12ms/step - loss: nan
Epoch 3/10
96/96 [==============================] - 1s 10ms/step - loss: nan
Epoch 4/10
96/96 [==============================] - 1s 11ms/step - loss: nan
Epoch 5/10
96/96 [==============================] - 1s 12ms/step - loss: nan
Epoch 6/10
96/96 [==============================] - 1s 12ms/step - loss: nan
Epoch 7/10
96/96 [==============================] - 1s 11ms/step - loss: nan
Epoch 8/10
96/96 [==============================] - 1s 10ms/step - loss: nan
Epoch 9/10
96/96 [==============================] - 1s 11ms/step - loss: nan
Epoch 10/10
96/96 [==============================] - 1s 11ms/step - loss: nan
Printing starting_input: [[0.4375, None]]
Printing starting_input: [[0.4375, None], [0.4375, None]]
Printing starting_input: [[0.4375, None], [0.4375, None], [0.4375, None]]
Printing starting_input: [[0.4375, None], [0.4375, None], [0.4375, None], [0.625, None]]
Printing starting_input: [[0.4375, None], [0.4375, None], [0.4375, None], [0.625, None], [0.625, None]]
Printing starting_input: [[0.4375, None], [0.4375, None], [0.4375, None], [0.625, None], [0.625, None], [0.9375, None]]
Printing starting_input: [[0.4375, None], [0.4375, None], [0.4375, None], [0.625, None], [0.625, None], [0.9375, None], [0.625, None]]
Printing starting_input: [[0.4375, None], [0.4375, None], [0.4375, None], [0.625, None], [0.625, None], [0.9375, None], [0.625, None], [0.625, None]]
Printing starting_vectors: [[[nan nan]
[nan nan]]
[[nan nan]
[nan nan]]
[[nan nan]
[nan nan]]
[[nan nan]
[nan nan]]]
Printing rap_vectors: [array([[[nan, nan],
        [nan, nan]],
       [[nan, nan],
        [nan, nan]],
       [[nan, nan],
        [nan, nan]],
       [[nan, nan],
        [nan, nan]]], dtype=float32), array([[[nan, nan],
        [nan, nan]],
       [[nan, nan],
        [nan, nan]],
       [[nan, nan],
        [nan, nan]],
       [[nan, nan],
        [nan, nan]]], dtype=float32)]
(the same all-NaN 4x2x2 arrays repeat for each remaining "Printing rap_vectors:" line, the list growing by one array per loop iteration, and again for the final "Printing vectors:" output)
About to write rap (this could take a moment)...
vector_half: [nan, nan], syllables: 0.6875, rhyme: None, penalty: 0
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-13-e316dd91dea0> in <module>
----> 1 main(depth) # run this to get the actual rap
<ipython-input-12-2899ad922142> in main(depth)
433 vectors = compose_rap(bars, rhyme_list, text_file, model)
434 print(f'Printing vectors: {vectors}')
--> 435 rap = vectors_into_song(vectors, bars, rhyme_list)
436
437 for bar in rap:
<ipython-input-12-2899ad922142> in vectors_into_song(vectors, generated_lyrics, rhyme_list)
371 penalty = 0
372 # calculate the score of the current line
--> 373 total_score = calculate_score(vector, item[1], item[2], penalty)
374 score_entry = [line, total_score]
375 # add the score of the current line to a scorelist
<ipython-input-12-2899ad922142> in calculate_score(vector_half, syllables, rhyme, penalty)
335 # the predicted rhyme and generated rhyme and then subtract the penalty
336 score = 1.0 - (abs((float(desired_syllables) - float(syllables))) + abs(
--> 337 (float(desired_rhyme) - float(rhyme)))) - penalty
338
339 return score
TypeError: float() argument must be a string or a number, not 'NoneType'
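For context on the traceback: rhyme() returns None whenever rhyme_list.index(rhymescheme) raises, and one plausible reason it always raises here is the x.encode('UTF8') step in rhyme(): it turns the candidate endings into bytes objects like b'ck', while rhyme_list holds plain strings like 'ck', so .index() never matches. Those None values then flow into build_dataset(), where np.asarray(...).astype('float32') converts None to nan, which would also explain the loss: nan during training. Below is a minimal sketch of rhyme() without the encode step and with a fallback instead of None; this is an assumption-based fix, not something tested against the Bars2.txt data:
def rhyme(line, rhyme_list):
    word = re.sub(r"\W+", '', line.split(" ")[-1]).lower()
    # keep the rhyme candidates as str so they can match the str entries in rhyme_list
    rhymeslist = pronouncing.rhymes(word)
    rhymeslistends = [x[-2:] for x in rhymeslist]
    try:
        rhymescheme = max(set(rhymeslistends), key=rhymeslistends.count)
    except Exception:
        rhymescheme = word[-2:]
    if rhymescheme in rhyme_list:
        return rhyme_list.index(rhymescheme) / float(len(rhyme_list))
    # fall back to 0.0 so downstream float() calls and the training data never see None
    return 0.0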

Related

sklearn.feature_selection.chi2 returns list of NaN values

I have the following dataset (I will upload only a sample of 4 rows, the real one has 15,000 rows):
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_selection import chi2

quotes = ["Sip N Shop Come thru right now Marjais PopularNobodies MMR Marjais SipNShop",
          "I do not know about you but My family and I will not take the Covid19 vaccine anytime soon",
          "MSignorile Immunizations should be mandatory Period In Oklahoma they will not let kids go to school without them It is dangerous otherwise",
          "President Obama spoke in favor of vaccination for children Fox will start telling its viewers to choose against vaccination in 321"]
labels = [0, 1, 2, 0]
dummy = pd.DataFrame({"quote": quotes, "label": labels})
And I want to apply the famous chi-squared test to eliminate irrelevant words per category (0, 1, 2), where 0: neutral, 1: positive, 2: negative.
Below is my approach (similar to the approach implemented here).
Briefly, I create a list of 0's with length equal to the corpus length; the 0's represent the first label, y = 0. For the second label (1 = positive) I will create a list of 1's, and similarly for the third label (2 = negative).
After applying this 3 times (once for each target label), I will have three lists with the most dependent words per label. This final list will be my new vocabulary for the TF-IDF vectorizer.
def tweeter_tokenizer(tweet):
    return tweet.split(' ')

english_stopwords = stopwords.words('english')  # nltk stop word list, needed for stop_words below
vectorizer = TfidfVectorizer(tokenizer=tweeter_tokenizer, ngram_range=(1, 2), stop_words=english_stopwords)
vectorizer.fit(dummy["quote"])
X_train = vectorizer.transform(dummy["quote"])
y_train = dummy["label"]
feature_names = vectorizer.get_feature_names_out()
y_neutral = np.array([0]*X_train.shape[0])
pValue = 0.90
chi_neutral, p_neutral = chi2(X_train, y_neutral)
chi_neutral
The chi_neutral object is:
array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan])
At the end I want to create a dataframe with one row per unique token (feature_names) for each label, and keep only the words with score > pValue. The dataframe will show me how many of the corpus tokens are dependent on class 0 (neutral). The same approach will be followed for the rest of the labels (1: positive, 2: negative).
y_df = np.array([0]*X_train.shape[1])
tokens_neutral_dependent = pd.DataFrame({
    "tweet_token": feature_names,
    "chi2_score": 1 - p_neutral,
    "neutral_label": y_df  # length = length of feature_names
})
tokens_neutral_dependent = tokens_neutral_dependent.sort_values(["neutral_label", "chi2_score"], ascending=[True, False])
tokens_neutral_dependent = tokens_neutral_dependent[tokens_neutral_dependent["chi2_score"] > pValue]
tokens_neutral_dependent.shape
I don't think it's really meaningful to compute the chi-squared statistic without having the classes attached. The code chi2(X_train, y_neutral) is asking "Assuming that class and the parameter are independent, what are the odds of getting this distribution?" But all of the examples you're showing it are the same class.
I would suggest this instead:
chi_neutral, p_neutral = chi2(X_train, y_train)
If you're interested in chi-square statistics between particular classes, you can filter the dataset first to just two classes, then run the chi-squared test. But this step is not necessary.
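For example, a minimal sketch of that two-class comparison, reusing the dummy DataFrame and fitted vectorizer from the question (the choice of classes 0 and 1 is just illustrative):
# keep only the rows labeled neutral (0) or positive (1), then test those two classes
mask = dummy["label"].isin([0, 1])
X_pair = vectorizer.transform(dummy.loc[mask, "quote"])
chi_pair, p_pair = chi2(X_pair, dummy.loc[mask, "label"])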

search string in pandas column

I am trying to find a substring in the hard_skills_name column below; I want all rows that have 'Apple Products' as a hard skill.
I tried the code below:
df.loc[df['hard_skills_name'].str.contains("Apple Products", case=False)]
but getting this error:
KeyError Traceback (most recent call last)
<ipython-input-49-acdcdfbdfd3d> in <module>
----> 1 df.loc[df['hard_skills_name'].str.contains("Apple Products", case=False)]
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
877
878 maybe_callable = com.apply_if_callable(key, self.obj)
--> 879 return self._getitem_axis(maybe_callable, axis=axis)
880
881 def _is_scalar_access(self, key: Tuple):
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1097 raise ValueError("Cannot index with multidimensional key")
1098
-> 1099 return self._getitem_iterable(key, axis=axis)
1100
1101 # nested tuple slicing
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_iterable(self, key, axis)
1035
1036 # A collection of keys
-> 1037 keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
1038 return self.obj._reindex_with_indexers(
1039 {axis: [keyarr, indexer]}, copy=True, allow_dups=True
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1252 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1253
-> 1254 self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
1255 return keyarr, indexer
1256
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1296 if missing == len(indexer):
1297 axis_name = self.obj._get_axis_name(axis)
-> 1298 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1299
1300 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Float64Index([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n nan, nan, nan, nan, nan, nan, nan, nan, nan],\n dtype='float64')] are in the [index]"
Try chaining a (temporary) conversion of the list of strings into a comma-separated string with str.join() before the string search:
df[df['hard_skills_name'].str.join(', ').str.contains("Apple Products", case=False)]
The problem is that the string you are searching for is contained within a list. You cannot search for a string inside a list directly with .str.contains(). To solve it, convert each list of strings into one long string first (e.g. with commas separating the substrings) using .str.join() before doing the string search.
Your index has null values. You're going to have to make a boolean mask for this. Directly answering your question:
df.loc[(df.index.notnull()) & (df['hard_skills_name'].str.contains("Apple Products", case=False))]
This should exclude anything that has a null index value while matching rows whose hard_skills_name contains the given string.
However, I suspect that this will also exclude some data you're looking for. The solution in that case would be to change your index so it does not contain any NaNs. Whether that means replacing them with a placeholder value or creating a brand-new index is up to you (see the sketch below).
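A minimal sketch of the brand-new-index option, assuming df is the DataFrame from the question and combining it with the str.join() trick from the other answer (since hard_skills_name holds lists):
df = df.reset_index(drop=True)  # swap the NaN-containing index for a clean RangeIndex
df.loc[df['hard_skills_name'].str.join(', ').str.contains("Apple Products", case=False)]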

Calculating numeric list with string values

I have a numeric list with NaN values and I want to apply mathematical functions to it. I also need the NaN values to still be there after the computation.
list_a = [1827.07, 1376.21, nan, nan, 1001.88, 978.07]
recal_list = []
for i in list_a:
    time = round(i/55)
    recal_list.append(time)
You could use a pandas Series:
from pandas import Series
from numpy import nan

list_a = [1827.07, 1376.21, nan, nan, 1001.88, 978.07]
result = round(Series(list_a) / 55)
print(result.tolist())  # [33.0, 25.0, nan, nan, 18.0, 18.0]
Or your solution, with an if:
from numpy import nan, isnan

list_a = [1827.07, 1376.21, nan, nan, 1001.88, 978.07]
recal_list = []
for val in list_a:
    recal_list.append(val if isnan(val) else round(val / 55))
print(recal_list)  # [33.0, 25.0, nan, nan, 18.0, 18.0]

read a matrix from a text file into numpy

I am using a piece of software which outputs only the upper triangle of a symmetric matrix in the following format:
2 3 4 5 6 7 8
1: -0.00 0.09 0.03 -0.27 -0.28 0.83 -0.31
2: 0.09 0.03 -0.26 -0.28 0.83 -0.31
3: 0.00 0.11 0.11 0.33 0.10
4: 0.03 0.03 -0.00 0.03
5: -0.02 0.91 -0.04
6: 0.92 -0.03
7: 0.91
I would like to plot this matrix in a heatmap. However, I have a problem reading this
text file into a data structure. How could I turn this text file into, for example, a numpy array that I could use as a matrix for plotting?
Thank you!
If I read your text file correctly, you can read in the file using pandas with a whitespace delimiter:
import pandas as pd
import numpy as np

dat = pd.read_csv("test.txt", index_col=0, delimiter=r'\s+').to_numpy()
Looks like this:
array([[-0. , 0.09, 0.03, -0.27, -0.28, 0.83, -0.31],
[ 0.09, 0.03, -0.26, -0.28, 0.83, -0.31, nan],
[ 0. , 0.11, 0.11, 0.33, 0.1 , nan, nan],
[ 0.03, 0.03, -0. , 0.03, nan, nan, nan],
[-0.02, 0.91, -0.04, nan, nan, nan, nan],
[ 0.92, -0.03, nan, nan, nan, nan, nan],
[ 0.91, nan, nan, nan, nan, nan, nan]])
So we just need to roll each row so the NaN padding moves to the lower triangle:
idx = np.arange(dat.shape[1])
arr = np.empty(dat.shape)
for i in range(dat.shape[1]):
    arr[i] = dat[i][np.concatenate([idx[-i:], idx[:-i]])]
And the end result looks like this:
arr
array([[-0. , 0.09, 0.03, -0.27, -0.28, 0.83, -0.31],
[ nan, 0.09, 0.03, -0.26, -0.28, 0.83, -0.31],
[ nan, nan, 0. , 0.11, 0.11, 0.33, 0.1 ],
[ nan, nan, nan, 0.03, 0.03, -0. , 0.03],
[ nan, nan, nan, nan, -0.02, 0.91, -0.04],
[ nan, nan, nan, nan, nan, 0.92, -0.03],
[ nan, nan, nan, nan, nan, nan, 0.91]])
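Since the original goal was a heatmap, here is a minimal plotting sketch on top of arr (assuming matplotlib is available; by default imshow leaves NaN cells uncolored):
import matplotlib.pyplot as plt

plt.imshow(arr, cmap='viridis')  # NaN entries fall outside the colormap
plt.colorbar()
plt.show()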
I could come up with the following solution:
t = open("test_fit")
long_l = []
for line in t:
    line = line.rstrip().split()
    long_l.append(line[1:])
long_l_new = long_l[1:]
print(long_l_new)
for index, item in enumerate(long_l_new):
    print(index, item)
    item.insert(0, '0')
long_l_new.append(['0'])
mat = []
for index, item in enumerate(long_l_new):
    if index == 0:
        to_insert = long_l_new[index][index + 1]
        new_l = long_l_new[index + 1]
        new_l_to_add = new_l.insert(index, to_insert)
    else:
        if index < len(long_l_new) - 1:
            for i in range(0, index + 1):
                to_insert = long_l_new[i][index + 1]
                new_l = long_l_new[index + 1]
                new_l.insert(i, to_insert)
Output:
[['0', '-0.00', '0.09', '0.03', '-0.27', '-0.28', '0.83', '-0.31'],
['-0.00', '0', '0.09', '0.03', '-0.26', '-0.28', '0.83', '-0.31'],
['0.09', '0.09', '0', '0.00', '0.11', '0.11', '0.33', '0.10'],
['0.03', '0.03', '0.00', '0', '0.03', '0.03', '-0.00', '0.03'],
['-0.27', '-0.26', '0.11', '0.03', '0', '-0.02', '0.91', '-0.04'],
['-0.28', '-0.28', '0.11', '0.03', '-0.02', '0', '0.92', '-0.03'],
['0.83', '0.83', '0.33', '-0.00', '0.91', '0.92', '0', '0.91'],
['-0.31', '-0.31', '0.10', '0.03', '-0.04', '-0.03', '0.91', '0']]

Remove nan values from a dict in python

I am trying to remove keys with nan values from a dictionary created from pandas, using Python. Is there a way I can achieve this?
Here is a sample of my dictionary:
{'id': 1, 'internal_id': '1904', 'first_scraping_time': '2020-04-17 12:44:59.0', 'first_scraping_date': '2020-04-17', 'last_scraping_time': '2020-06-20 03:08:47.0', 'last_scraping_date': '2020-06-20', 'is_active': 1,'flags': nan, 'phone': nan,'size': 60.0, 'available': '20-06-2020', 'timeframe': nan, 'teaser': nan, 'remarks': nan, 'rent': 4984.0, 'rooms': '3', 'downpayment': nan, 'deposit': '14952', 'expenses': 600.0, 'expenses_tv': nan, 'expenses_improvements': nan, 'expenses_misc': nan, 'prepaid_rent': '4984', 'pets': nan, 'furnished': nan, 'residence_duty': nan, 'precision': nan, 'nearby_cities': nan,'type_dwelling': nan, 'type_tenants': nan, 'task_id': '614b8fc2-409c-403a-9650-05939e8a89c7'}
Thank you!
nan is a tricky object to work with because it doesn't compare equal to (or even necessarily share object identity with) anything, including itself.
You can use math.isnan to test for it. Note that math.isnan raises a TypeError for non-numeric values, and your dictionary also contains strings, so guard the check with isinstance:
import math

new = {key: value for (key, value) in old.items()
       if not (isinstance(value, float) and math.isnan(value))}
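A quick usage sketch on a trimmed, hypothetical version of the sample dictionary (the variable name old is assumed):
old = {'id': 1, 'internal_id': '1904', 'flags': float('nan'), 'size': 60.0, 'pets': float('nan')}
new = {key: value for (key, value) in old.items()
       if not (isinstance(value, float) and math.isnan(value))}
print(new)  # {'id': 1, 'internal_id': '1904', 'size': 60.0}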
