Spark streaming from a File source - python

I have two notebooks: streaming.ipynb and File.ipynb (tools: Spark 3.2.1, Colab notebook).
streaming.ipynb:
import sys
from textblob import TextBlob
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# Emoji patterns
emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)

# define stopwords
nltk.download('stopwords')
stopword = stopwords.words('english')

# stemmer from nltk
ps = nltk.PorterStemmer()

# lemmatizer from nltk
nltk.download('wordnet')
wn = WordNetLemmatizer()

def abb(tweet):
    tweet = re.sub(r"im|i'm|iam", "i am", tweet)
    tweet = re.sub(r"he's", "he is", tweet)
    tweet = re.sub(r"she's", "she is", tweet)
    tweet = re.sub(r"that's", "that is", tweet)
    tweet = re.sub(r"there's", "there is", tweet)
    tweet = re.sub(r"what's", "what is", tweet)
    tweet = re.sub(r"where's", "where is", tweet)
    tweet = re.sub(r"\'ll", " will", tweet)
    tweet = re.sub(r"\'ve", " have", tweet)
    tweet = re.sub(r"\'re", " are", tweet)
    tweet = re.sub(r"\'d", " would", tweet)
    tweet = re.sub(r"won't", "will not", tweet)
    tweet = re.sub(r"don't", "do not", tweet)
    tweet = re.sub(r"did't", "did not", tweet)
    tweet = re.sub(r"can't", "can not", tweet)
    tweet = re.sub(r"it's", "it is", tweet)
    tweet = re.sub(r"couldn't", "could not", tweet)
    tweet = re.sub(r"wouldn't", "would not", tweet)
    tweet = re.sub(r"have't", "have not", tweet)
    return tweet

# Functions to get the subjectivity and the polarity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

def getAnalysis(score):
    if score < 0:
        return "Negative"
    elif score == 0:
        return "Neutral"
    else:
        return "Positive"

def main():
    sc = SparkContext(appName="PysparkStreaming")
    ssc = StreamingContext(sc, 5)  # batch interval of 5 seconds
    lines = ssc.textFileStream('/content/drive/MyDrive/Colab Notebooks/log/')  # 'log/' is the monitored directory
    counts = lines.flatMap(lambda line: line.split("")) \
        .map(lambda x: x.lower()) \
        .map(lambda x: re.sub(r"#[A-Za-z0-9]+", "", x)) \
        .map(lambda x: re.sub(r"\\x[A-Za-z][0-9]+", "", x)) \
        .map(lambda x: re.sub(r"\\u[0-9][A-Za-z]|\\U[0-9][A-Za-z]+", "", x)) \
        .map(lambda x: re.sub(r'&amp;', '&', x)) \
        .map(lambda x: re.sub(r"b[\s]+", "", x)) \
        .map(lambda x: re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", x)) \
        .map(lambda x: abb(x)) \
        .map(lambda x: re.sub("[^a-zA-Z0-9\s]+", "", x)) \
        .map(lambda x: re.sub('[0-9]+', ' ', x)) \
        .map(lambda x: emoji_pattern.sub(r'', x)) \
        .map(lambda x: "".join([char for char in x if char not in string.punctuation])) \
        .map(lambda x: re.split('\W+', x)) \
        .map(lambda x: " ".join(x)) \
        .map(lambda x: [word for word in x if word not in stopword]) \
        .map(lambda x: getPolarity(x))
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()

if __name__ == "__main__":
    main()
and File.ipynb:
from random import randint
import time

"""
This is used to create 30 files, one every 5 seconds.
Each file stores content taken dynamically from 'lorem.txt' using the code below.
"""

def main():
    a = 1
    # read the content of the 'lorem.txt' file
    with open('/content/drive/MyDrive/Colab Notebooks/lorem.txt', 'r', encoding='latin-1') as file:
        lines = file.readlines()
    while a <= 30:
        totalline = len(lines)
        linenumber = randint(0, totalline - 10)
        with open('/content/drive/MyDrive/Colab Notebooks/log/log.txt'.format(a), 'w') as writefile:
            writefile.write(' '.join(line for line in lines[linenumber:totalline]))
        print('creating file log{}.txt'.format(a))
        a += 1
        time.sleep(5)

if __name__ == '__main__':
    main()
Notes:
In File.ipynb I read lorem.txt, which contains tweets, and split the data into log files. The log directory starts out empty, but when the code runs it is supposed to create log1 with part of the tweets, then log2, and so on. streaming.ipynb streams from those logs every 5 seconds and prints the result. lorem.txt contains:
'A woman faces 100 lashes and up to seven years in prison because she was a VICTIM of RAPE in Qatar 🇶🇦\n\nShe was told she could possibly avoid criminal charges if she married 💍her attacker\n\n#FIFA22 #WorldCup2022 #Qatar2022 #Qatar'
'#WorldCup2022 '
'i love my mother'
'it is so bad!'
'Black Stars technical team among 32 for 2022 Aspire Academy Global Summit\n\n- \n\n.#Ghana #WorldCup2022 '
'🔊New Episode 163 - The Business End out now!!! #WorldCup2022 #TorontoFC #RealMadridChelsea #LiverpoolVillarreal #ManCityRealMadrid #ARSMUN #ACMILAN #ChampionsLeague #uefachampionsleague \nOut now on all popular audio podcast platforms🎶 '
'Check out new work on my #Behance profile: "Edited" \n#edit #photoshop #photography #road #street #art #design #architect #amazon #quote #fun #amazing #EidMubarak #ukraine #putin #gold #oil #WorldCup2022'
'Can we talk about how bad the Al Rihla looks #WorldCup2022 '
'They should hire Mourinho as a consultant for the #WorldCup2022'
'#yuzi_chahal #imkuldeep18 The spin duo back with bang bang performance for representing franchise Eagerly Waiting to see in India team behind the stump by Rishaph pant The selectors are very big problem to pick a players for #WorldCup2022\n#IPL20222'
'#DraganflyInc Draganflyer Commander can sanitize the entire 50,000 seater stadium in 4-6 hrs. The #WorldCup2022 in #Qatar2022 is slowly approaching. This drone could really help keep stadiums safe. Food for thought.\n\n#CovidIsntOver \n#GOPTaxScam \n#IMPEACHBIDENNOW \n#BTC \n#NFTs '
'NFT News 👇\n\nTooneyChain: when NFTs invite themselves to the 2022 football world cup\n#TooneyChainNFT \n\n#WorldCup2022 #football #NFTs #NFTcollectibles '
'Mark my words. 3-1 USA over England this fall in the #WorldCup2022 . If we win, England has to call it soccer from now on.'
'you can as well use our services in projects that are not permanent like #Containerhouse or #structures for #WorldCup2022 ,also for #camps ,#swimmingpool #parks #offices #dormitory'
'#bhogleharsha Than we need to forgot about #BhuviOfficial ?👀\n#ipl2022 #WorldCup2022'
'Ghana🇬🇭 and Tunisia🇹🇳 have been invited to take part in the Kirin Super Cup 2022, which will be hosted by Japan from Friday, June 10 to Tuesday, June 14, 2022, in the lead up to the #WorldCup2022 in Qatar.\n\nThe other 2 participants are hosts Japan🇯🇵 and Chile🇨🇱 #AfricanFootball'
'join us on TG for our game of the day!⚽️⚽️ The World Cup token team is rooting for a #Cristiano goal today! Good luck to both sides today! #ManUtd 🔥 #ChelseaFC $WCT #WCT #WctArmy #WorldCup2022 #Qatar2022 #100xgem #BSC #BNB\xa0\xa0\xa0#BSCGem #BNBchain #1000xgem #moonshot #ETH #CRO #BTC '
'#julietbawuah #tv3_ghana Wow, super talented player. Who knows, with him on #Qatar’s team they might just win the #WorldCup2022. #Qatar2022 #Fooball #games'
'#ForthHelena #thevinetway #jodyvance Jody are you a QA asset?\nCan Jody tweet about LGBTQ issues? Notable events from around the world ignored & Jody is regurgitating CNn narratives?\nToo many ads not enough of what you want? SiriusXM commercial free entertainment.\n#Canucks \n#WorldCup2022\n#NFLDraft\n#TikTok '
'Congratulations, Danyel! You’re really having an impact in this field at just the right time! This, alongside your new co-authored book on #Qatar & the #WorldCup2022 , AND our #CIRSGUQ research project on the same subject! ! 👏👏 '
'Its the crypto world cup final and you are the manager.\nWhich front 3 are you going with?\n\n#xrp #vet #qnt #ada #zil #xlm #doge #btc #cro #eth #sand #luna #WorldCup2022\n#cryptocurrency'
'#yuzi_chahal I Would like to see KulCha Combination in #WorldCup2022'
'It’s going to be scary, but not for us 🇵🇹\n\n#Portugal #WorldCup2022 '
The error is shown below:
Py4JJavaError Traceback (most recent call last)
<ipython-input-5-bb31ef108289> in <module>()
88
89 if __name__ == "__main__":
---> 90 main()
3 frames
/content/spark-3.2.1-bin-hadoop3.2/python/lib/py4j-0.10.9.3-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o23.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/streaming/util.py", line 68, in call
r = self.func(t, *rdds)
File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/streaming/dstream.py", line 170, in takeAndPrint
taken = rdd.take(num + 1)
File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/rdd.py", line 1568, in take
res = self.context.runJob(self, takeUpToNumLeft, p)
File "/content/spark-3.2.1-bin-hadoop3.2/python/pyspark/context.py", line 1227, in runJob
sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1322, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/py4j-0.10.9.3-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (9557898acd7f executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 619, in main
process()
File "/content/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 611, in process
How can I resolve this, please?

I see two problems with your code:
As far as I understand, you want to fill the directory /content/drive/MyDrive/Colab Notebooks/log/ with files following the log<number>.txt pattern. There seems to be a bug in this line:
with open('/content/drive/MyDrive/Colab Notebooks/log/log.txt'.format(a), 'w') as writefile:
You don't actually create multiple files; you overwrite the single log.txt file on every iteration because the curly brackets are missing from the file path.
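For reference, the corrected line simply restores the {} placeholder so that each iteration writes a new file (same path and write call as in the question):

with open('/content/drive/MyDrive/Colab Notebooks/log/log{}.txt'.format(a), 'w') as writefile:
    writefile.write(' '.join(line for line in lines[linenumber:totalline]))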
The second problem is more subtle: you are writing to the local filesystem in File.ipynb, while StreamingContext.textFileStream expects a path on an HDFS-compatible filesystem and only picks up files that newly appear in the monitored directory after the stream starts. Please read the corresponding chapter in the documentation: https://spark.apache.org/docs/latest/streaming-programming-guide.html
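If you still want to test this locally, one possible workaround (a sketch based on the documented textFileStream behaviour, not part of the original answer) is to write each batch to a temporary location first and then rename it into the monitored directory, since the stream should only ever see complete files that were moved in atomically:

import os

# hypothetical helper: write outside the watched directory, then rename into it;
# tmp_dir should be on the same filesystem as target_dir so the rename is atomic
def publish(content, tmp_dir, target_dir, filename):
    tmp_path = os.path.join(tmp_dir, filename)
    with open(tmp_path, 'w') as f:
        f.write(content)
    # the streaming job never picks up a half-written file this way
    os.rename(tmp_path, os.path.join(target_dir, filename))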

Related

"IndexError: list index out of range" When creating an automated response bot

I'm creating a chatbot which uses questions from a CSV file and checks similarity using scikit-learn and NLTK. However, I'm getting an error if the same input is entered twice.
This is the main code that takes the user input and outputs an answer to the user:
import random
import string
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = pd.read_csv('FootballQA.csv')
question = data['Q'].tolist()
answer = data['A'].tolist()

lemmer = nltk.stem.WordNetLemmatizer()
# WordNet is a semantically-oriented dictionary of English included in NLTK.

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey", "how are you")
GREETING_RESPONSES = ["hi", "hey", "hi there", "hello", "I am glad! You are talking to me"]

def greeting(sentence):
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

GI = ("how are you")
GR = ["i'm fine", "good,how can i help you!"]

def greet(sentence):
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

def responses(user):
    response = ''
    question.append(user)
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(question)
    val = cosine_similarity(tfidf[-1], tfidf)
    id1 = val.argsort()[0][-2]
    flat = val.flatten()
    flat.sort()
    req = flat[-2]
    if req == 0:
        robo_response = response + "I am sorry! I don't understand you"
        return robo_response
    else:
        response = response + answer[id1]
        question.remove(user)
        return response

command = 1
while command:
    v = input("Enter your value: ")
    if v == "exit":
        command = 0
    else:
        print(responses(str(v)))
When the program runs, it asks the user for input. The problem happens if the same input is entered twice: if I enter "football" it will first correctly display the output I want, but the second time it stops the program and I get this error:
Enter your value: scored
Alan shearer holds the goal record in the premier league.
Enter your value: football
I am sorry! I don't understand you
Enter your value: football
Traceback (most recent call last):
File "C:\Users\Chris\Desktop\chatbot_simple\run.py", line 79, in <module>
print(responses(str(v)))
File "C:\Users\Chris\Desktop\chatbot_simple\run.py", line 68, in responses
response = response+answer[id1]
IndexError: list index out of range
The csv:
Q,A
Who has scored the most goals in the premier league?,Alan shearer holds the goal record in the premier league.
Who has the most appearences in the premier league?,Gareth Barry has the most appearences in premier league history.
I've tried deleting the variable after each input, but it still somehow remembers it. Does anyone have any ideas?
Thanks,
Chris
answer = data['A'].tolist()
and then later on
id1 = val.argsort()[0][-2]
response = response + answer[id1]
So if answer has no element at index id1 you get "index out of range"; in your case id1 >= len(answer). This happens because when req == 0 you return before question.remove(user) runs, so the unmatched query stays in question. On the next call the TF-IDF matrix has an extra row, and id1 can point at that previously appended query, which has no corresponding entry in answer.
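A minimal sketch of one possible fix (my suggestion, not part of the original answer): make sure the query you appended to question is removed on every path, for example with try/finally, so the index computed from the TF-IDF matrix always stays within range of answer:

def responses(user):
    question.append(user)
    try:
        TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
        tfidf = TfidfVec.fit_transform(question)
        val = cosine_similarity(tfidf[-1], tfidf)
        id1 = val.argsort()[0][-2]
        flat = val.flatten()
        flat.sort()
        req = flat[-2]
        if req == 0:
            return "I am sorry! I don't understand you"
        return answer[id1]
    finally:
        # always drop the appended query, even when no match was found,
        # so question and answer keep the same length
        question.remove(user)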

p_mask=p_mask[span_idx].tolist(), AttributeError: 'list' object has no attribute 'tolist'

I'm getting the following error when I attempt to pass a question and context to a Transformer pipeline. The abort is actually occurring in the HuggingFace code.
Traceback (most recent call last):
File "pipeline.py", line 66, in <module>
main()
File "pipeline.py", line 63, in main
answer = query(value)
File "pipeline.py", line 45, in query
answer = qa(question=question, context=context)
File "/home/pi/.local/lib/python3.7/site-packages/transformers/pipelines/question_answering.py", line 248, in __call__
return super().__call__(examples[0], **kwargs)
File "/home/pi/.local/lib/python3.7/site-packages/transformers/pipelines/base.py", line 915, in __call__
return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
File "/home/pi/.local/lib/python3.7/site-packages/transformers/pipelines/base.py", line 921, in run_single
model_inputs = self.preprocess(inputs, **preprocess_params)
File "/home/pi/.local/lib/python3.7/site-packages/transformers/pipelines/question_answering.py", line 316, in preprocess
p_mask=p_mask[span_idx].tolist(),
AttributeError: 'list' object has no attribute 'tolist'
I am executing the code on a Raspberry Pi 4 arm71 CPU. I am using Transformer version 4.11.2 and PyTorch version 1.7.0a0. The code executes successfully under Windows 10.
Here is the code...
# -*- coding: iso-8859-15 -*-
import os, sys
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
#import torchaudio
import torch
#from speaking import speak

# globals
context = ""
model = None
qa = pipeline("question-answering")  # set up the model pipeline once

queries = {  # dictionary holding the questions
    "Question01": "How old are the Albuquerque volcanoes?",
    "Question02": "Where does andesitic compositions exist?",
    "Question03": "What is Tephrochronology?",
    "Question04": "Do volcanoes exist on other planets?",
    "Question05": "What type of volcano is Cerro Verde?",
    "Question06": "How old is the oldest rock in Albuqueruqe?",
    "Question07": "What does the gradual weathering of the loose cinders create?",
    "Question08": "What causes fissure eruption of the Albuquerque Volcanoes?",
    "Question09": "What is the name of the composite volcanoe that is 2.5 million years old?",
    "Question10": "What are the gas bubble in basalt called?",
    "Question11": "What do rocks in the Palomas Volcanic Field consist of?",
    "Question12": "What are the volcanic features of the Palomas Volcanic Field?",
    "Question13": "Where is the ZUNI-BANDERA FIELD AND MCCARTY'S LAVA FLOW located?",
    "Question14": "What surface is fairly common to many pahoehoe flows?",
    "Question15": "How far does the The Navajo Dine volcanic field extend?",
    "Question16": "How was the Valles Caldera formed?",
    "Question17": "What is the largest volcanic field within the Rio Grande rift?",
    "Question18": "What geologic feature is located in the central part of Charette Mesa, northwest of Wagon Mound?",
    "Question19": "Where did the most recent volcictivity occur in New Mexico?",
    "Question20": "what are the floors of the Jornada caves covered with?",
    "Question21": "The Albuquerque Volcanoes we see today are are the result of what?",
    "Question22": "In what year did the eruption of Mount Pinatubo occur?",
    "Question23": "How does volcanic ash form?",
    "Question24": "What are volocanic mudflows called?",
    "Question25": "What volcano is located about 33 miles east of Raton?"
}

def query(question):
    # Generate an answer to the question in context
    # qa = pipeline("question-answering")
    answer = qa(question=question, context=context)
    # save the model
    #torch.save(qa.model.state_dict(), "C:\\tmp")
    # Print the answer
    print("{0}: ".format(question))
    print(f"Answer: '{answer['answer']}' with score {answer['score']}")

def main():
    global qa, context
    # get the volcano corpus
    with open('volcanic.corpus', encoding="utf8") as file:
        context = file.read().replace('\n', '')
    # process each query
    for value in queries.values():
        answer = query(value)

if __name__ == "__main__":
    main()

I am making a voice assistant with Python and got an error; can you help me with this?

# Import packages
import speech_recognition as sr
import os
from gtts import gTTS
import datetime
import warnings
import calendar
import random
import wikipedia

# Ignore any warning messages
warnings.filterwarnings('ignore')

# Record audio and return it as a string
def recordAudio():
    # record the audio
    r = sr.Recognizer()  # create a speech recogniser
    # open the mic and start recording
    with sr.Microphone() as source:
        print('Say Something:')
        audio = r.listen(source)
    # Use Google speech recognition
    data = ''
    try:
        data = r.recognize_google(audio)
        print('You said : ' + data)
    except sr.UnknownValueError:
        print('Google Speech Recognition could not understand you, unknown error')
    except sr.RequestError as e:
        print('Error requesting results from the Google Speech Recognition service: ' + str(e))
    return data

recordAudio()

# A function to get the virtual assistant's response
def assistantResponse(text):
    print(text)
    # convert the text to speech
    myobj = gTTS(text=text, lang='en', slow=False)
    # save the audio to a file
    myobj.save('assistant_response.mp3')
    # play the converted file
    os.system('start assistant_response.mp3')

# A function for the wake word
def wakeWord(text):
    WAKE_WORDS = {'hey bongo', 'hey man', 'hey computer', 'okay google'}  # set of wake words
    text = text.lower()  # convert the text to lower case
    # Check to see if the user input contains a wake word
    for phrase in WAKE_WORDS:
        if phrase in text:
            return True
    # If no wake word is found in the text, return False
    return False

# A function to get the current date
def getDate():
    now = datetime.datetime.now()
    my_date = datetime.datetime.today()
    weekday = calendar.day_name[my_date.weekday()]  # e.g. Sunday
    monthNum = now.month
    dayNum = now.day
    # A list of months
    month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October',
                   'November', 'December']
    # A list of ordinal numbers
    ordinalNumbers = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th', '11th', '12th', '13th',
                      '14th', '15th', '16th', '17th', '18th', '19th', '20th', '21st', '22nd', '23rd', '24th', '25th',
                      '26th', '27th', '28th', '29th', '30th', '31st']
    return 'Today is ' + weekday + ' ' + month_names[monthNum - 1] + ' the ' + ordinalNumbers[dayNum - 1] + '.'

# Function to return a greeting
def greeting(text):
    # Greeting inputs
    GREETING_INPUTS = ['hi', 'hey', 'hola', 'wassup', 'hello']
    # Greeting responses
    GREETING_RESPONSES = ['howdy', 'all that good', 'hello master', 'heythere']
    # If the user's input is a greeting, return a randomly chosen greeting response
    for word in text.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES) + '.'
    # If no greeting was detected
    return ''

# A function to get a person's name from the text
def getPerson(text):
    wordList = text.split()  # split the text into words
    for i in range(0, len(wordList)):
        if i + 3 <= len(wordList) - 1 and wordList[i].lower() == 'who' and wordList[i + 1].lower() == 'is':
            return wordList[i + 2] + ' ' + wordList[i + 3]

while True:
    # record the audio
    text = recordAudio()
    response = ''
    # check for the wake word / phrase
    if wakeWord(text) == True:
        # check for greetings by the user
        response = response + greeting(text)
        # check to see if the user has said anything about the date
        if 'date' in text:
            get_date = getDate()
            response = response + ' ' + get_date
        # check to see if the user said 'who is'
        if 'who is' in text:
            person = getPerson(text)
            wiki = wikipedia.summary(person, sentences=2)
            response = response + ' ' + wiki
        # the assistant responds using audio and the text from response
        assistantResponse(response)
Below is the error:
C:\Users\90551\PycharmProjects\YazılımBilimi\venv\Scripts\python.exe C:/Users/90551/PycharmProjects/YazılımBilimi/voiceassistan.py
Traceback (most recent call last):
File "C:\Users\90551\PycharmProjects\YazılımBilimi\venv\lib\site-packages\speech_recognition\__init__.py", line 108, in get_pyaudio
import pyaudio
ModuleNotFoundError: No module named 'pyaudio'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/90551/PycharmProjects/YazılımBilimi/voiceassistan.py", line 37, in <module>
recordAudio()
File "C:/Users/90551/PycharmProjects/YazılımBilimi/voiceassistan.py", line 21, in recordAudio
with sr.Microphone() as source:
File "C:\Users\90551\PycharmProjects\YazılımBilimi\venv\lib\site-packages\speech_recognition\__init__.py", line 79, in __init__
self.pyaudio_module = self.get_pyaudio()
File "C:\Users\90551\PycharmProjects\YazılımBilimi\venv\lib\site-packages\speech_recognition\__init__.py", line 110, in get_pyaudio
raise AttributeError("Could not find PyAudio; check installation")
AttributeError: Could not find PyAudio; check installation
Process finished with exit code 1
I installed PyAudio after it said "Download PyAudio", but it still says to download PyAudio.
What other modules or packages should I install?
I have not debugged it; I only installed PyAudio.
This is all the information I have. I will try to give more information if you can't figure out how to solve it. :)
Thanks to the whole Stack Overflow community for the help.
Sorry I didn't do this sooner, but start by copying and pasting this into the terminal:
pip install PyAudio
If this doesn't work, don't worry; it happened to me as well.
You will have to install Homebrew, as it allows us to install portaudio. Copy and paste this into the terminal:
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)"
This will install Homebrew.
Once it is installed, copy and paste this into the terminal:
brew install portaudio
portaudio is what allows us to install pyaudio.
Then try installing the pyaudio package through PyCharm or whatever coding environment you use.
If you try to install PyAudio using pip, you might get an error, so try installing it with pipwin instead.
Install pipwin with:
pip install pipwin
Then install PyAudio with:
pipwin install PyAudio

TypeError: expected string or bytes-like object Tqdm

I have gone through several sources on Stack Overflow regarding this issue but couldn't resolve it, so I have posted it here. Kindly help me resolve it.
# Combining all the above statements
from tqdm import tqdm

Other_skill = []
# tqdm is for printing the status bar
for sentance in tqdm(project_data['Other skills'].values):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    sent = ' '.join(e for e in sent.split() if e not in stopwords)
    Other_skill.append(sent.lower().strip())
Error:
TypeError Traceback (most recent call last)
<ipython-input-12-30687b6f17e1> in <module>()
4 # tqdm is for printing the status bar
5 for sentance in tqdm(project_data['Other skills'].values):
----> 6 sent = decontracted(sentance)
7 sent = sent.replace('\\r', ' ')
8 sent = sent.replace('\\"', ' ')
<ipython-input-7-a344e4b38b78> in decontracted(phrase)
4 def decontracted(phrase):
5 # specific
----> 6 phrase = re.sub(r"won't", "will not", phrase)
7 phrase = re.sub(r"can\'t", "can not", phrase)
8
C:\ProgramData\Anaconda3\lib\re.py in sub(pattern, repl, string, count, flags)
189 a callable, it's passed the match object and must return
190 a replacement string to be used."""
--> 191 return _compile(pattern, flags).sub(repl, string, count)
192
193 def subn(pattern, repl, string, count=0, flags=0):
TypeError: expected string or bytes-like object
I think the brackets on values() are needed. If I am correct, project_data is a dictionary and you missed the brackets on values:
from tqdm import tqdm

Other_skill = []
# tqdm is for printing the status bar
# the values must have round brackets
for sentance in tqdm(project_data['Other skills'].values()):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    sent = ' '.join(e for e in sent.split() if e not in stopwords)
    Other_skill.append(sent.lower().strip())
Looking at the stack trace, we can see that re.sub is given the wrong kind of input in decontracted.
The best approach would be to place a breakpoint where the arrow is and check the value of phrase:
4 def decontracted(phrase):
5 # specific
----> 6 phrase = re.sub(r"won't", "will not", phrase)
If you do not know how to do so, you can add some debugging code like so:
def decontracted(phrase):
    # specific
    print(f'phrase: {phrase}\ttype: {type(phrase)}')
    phrase = re.sub(r"won't", "will not", phrase)
    [...]
This will print each phrase given to decontracted along with its type. The values should only be bytes or str; if not, you have found your bug and can correct it accordingly.
I hope this helps; we do not have enough information to help you further.
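If the debugging output shows values that are not strings, a frequent culprit is NaN in the column (pandas stores missing values as floats). A small guard, assuming project_data is a pandas DataFrame as the column access suggests, could look like this:

from tqdm import tqdm

Other_skill = []
for sentance in tqdm(project_data['Other skills'].values):
    if not isinstance(sentance, str):
        # skip missing/non-string entries (e.g. NaN) instead of passing them to re.sub
        continue
    sent = decontracted(sentance)
    Other_skill.append(sent.lower().strip())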

rdd.first() does not give an error but rdd.collect() does

I am working in PySpark and have the following code, where I am processing tweets and making an RDD of (user_id, text) pairs. Below is the code:
"""
# Construct an RDD of (user_id, text) here.
"""
import json
def safe_parse(raw_json):
try:
json_object = json.loads(raw_json)
if 'created_at' in json_object:
return json_object
else:
return;
except ValueError as error:
return;
def get_usr_txt (line):
tmp = safe_parse (line)
return ((tmp.get('user').get('id_str'),tmp.get('text')));
usr_txt = text_file.map(lambda line: get_usr_txt(line))
print (usr_txt.take(5))
and the output looks okay (as shown below)
[('470520068', "I'm voting 4 #BernieSanders bc he doesn't ride a CAPITALIST PIG adorned w/ #GoldmanSachs $. SYSTEM RIGGED CLASS WAR "), ('2176120173', "RT #TrumpNewMedia: .#realDonaldTrump #America get out & #VoteTrump if you don't #VoteTrump NOTHING will change it's that simple!\n#Trump htt…"), ('145087572', 'RT #Libertea2012: RT TODAY: #Colorado’s leading progressive voices to endorse #BernieSanders! #Denver 11AM - 1PM in MST CO State Capitol…'), ('23047147', '[VID] Liberal Tears Pour After Bernie Supporter Had To Deal With Trump Fans '), ('526506000', 'RT #justinamash: .#tedcruz is the only remaining candidate I trust to take on what he correctly calls the Washington Cartel. ')]
However, as soon as I do
print (usr_txt.count())
I get an error like below
Py4JJavaError Traceback (most recent call last)
<ipython-input-60-9dacaf2d41b5> in <module>()
8 usr_txt = text_file.map(lambda line: get_usr_txt(line))
9 #print (usr_txt.take(5))
---> 10 print (usr_txt.count())
11
/usr/local/spark/python/pyspark/rdd.py in count(self)
1054 3
1055 """
-> 1056 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1057
1058 def stats(self):
What am I missing? Is the RDD not created properly, or is there something else? How do I fix it?
You return None from the safe_parse method when there is no created_at element in the parsed JSON line or when there is a parsing error. The error then occurs while getting elements from the parsed JSON in (tmp.get('user').get('id_str'), tmp.get('text')), because tmp can be None.
The solution is to check for None in the get_usr_txt method:
def get_usr_txt(line):
    tmp = safe_parse(line)
    if tmp != None:
        return (tmp.get('user').get('id_str'), tmp.get('text'))
Now the question is why print(usr_txt.take(5)) showed a result while print(usr_txt.count()) caused the error.
That's because usr_txt.take(5) only evaluated the first five records and not the rest, so it never hit a line that parses to None.
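As a follow-up (not part of the original answer): since get_usr_txt now returns None for lines that fail to parse, you probably also want to drop those entries before counting, for example:

# filter out the None entries produced by unparseable lines before counting
usr_txt = text_file.map(get_usr_txt).filter(lambda pair: pair is not None)
print(usr_txt.count())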
