I want to run some example code from GitHub (https://github.com/kaantas/spark-twitter-sentiment-analysis). I followed the steps below:
Started zkserver
Started Kafka 2.5.0 (I am also using Apache Spark 3.0.0 and JDK 8)
Started tweet_listener.py (tweets start to stream; I can see them in the cmd window)
I open twitter_topic_avg_sentiment_val.py with Spyder and it just shows the traceback below
Note: I don't know anything about jars; if I need to use an external jar, please explain how.
Thanks a lot.
Traceback (most recent call last):
File "C:\Users\merha\Desktop\spark-twitter-sentiment-analysis-master\twitter_topic_avg_sentiment_val.py", line 40, in <module>
query.awaitTermination()
File "C:\Anaconda3\lib\site-packages\pyspark\sql\streaming.py", line 103, in awaitTermination
return self._jsq.awaitTermination()
File "C:\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "C:\Anaconda3\lib\site-packages\pyspark\sql\utils.py", line 137, in deco
raise_from(converted)
File "<string>", line 3, in raise_from
StreamingQueryException: org/apache/spark/kafka010/KafkaConfigUpdater
=== Streaming Query ===
Identifier: [id = f5dd9cb5-fcea-42ec-a20e-93a2ad233e1f, runId = 6cffdd89-3792-4500-a508-e4abc76425fb]
Current Committed Offsets: {}
Current Available Offsets: {}
Current State: INITIALIZING
Thread State: RUNNABLE
---------------- tweet_listener.py ----------------
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener
import json
import twitter_config
import pykafka
from afinn import Afinn
import sys
from sys import exit

class TweetListener(StreamListener):
    def __init__(self):
        # produce to the 'twitter3' Kafka topic on the local broker
        self.client = pykafka.KafkaClient("localhost:9092")
        self.producer = self.client.topics[bytes('twitter3', 'ascii')].get_producer()

    def on_data(self, data):
        try:
            json_data = json.loads(data)
            send_data = '{}'
            json_send_data = json.loads(send_data)
            json_send_data['text'] = json_data['text']
            json_send_data['senti_val'] = afinn.score(json_data['text'])
            print(json_send_data['text'], " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ", json_send_data['senti_val'])
            self.producer.produce(bytes(json.dumps(json_send_data), 'ascii'))
            return True
        except KeyError:
            return True

    def on_error(self, status):
        print(status)
        return True

consumer_key = "xxxxxxxxxx"
consumer_secret = "xxxxxxxxxxx"
access_token = "xxxxxxxxxxxx"
access_secret = "xxxxxxxxxx"

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# create AFINN object for sentiment analysis
afinn = Afinn()

twitter_stream = Stream(auth, TweetListener())
twitter_stream.filter(languages=['en'], track=["big data"])
---------------- twitter_topic_avg_sentiment_val.py ----------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import json
import sys
from pyspark.sql.types import *

def fun(avg_senti_val):
    try:
        if avg_senti_val < 0: return 'NEGATIVE'
        elif avg_senti_val == 0: return 'NEUTRAL'
        else: return 'POSITIVE'
    except TypeError:
        return 'NEUTRAL'

if __name__ == "__main__":
    schema = StructType([
        StructField("text", StringType(), True),
        StructField("senti_val", DoubleType(), True)
    ])

    spark = SparkSession.builder.appName("TwitterSentimentAnalysis").getOrCreate()

    kafka_df = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "twitter3").option("startingOffsets", "earliest").load()
    kafka_df_string = kafka_df.selectExpr("CAST(value AS STRING)")
    tweets_table = kafka_df_string.select(from_json(col("value"), schema).alias("data")).select("data.*")

    sum_val_table = tweets_table.select(avg('senti_val').alias('avg_senti_val'))

    # udf = USER DEFINED FUNCTION
    udf_avg_to_status = udf(fun, StringType())

    # average of senti_val column to status column
    new_df = sum_val_table.withColumn("status", udf_avg_to_status("avg_senti_val"))

    query = kafka_df_string.writeStream.format("console").option("truncate", "false").start()
    query.awaitTermination()
After I downloaded this jar file
spark-token-provider-kafka-0-10
and copied it to the Spark jars folder (or added it to SPARK_CLASSPATH), my problem was resolved.
Have you ever submitted Spark with the Kafka package as a configuration? See the third line.
spark-submit --master yarn --deploy-mode cluster \
--py-files "${PY_ZIP}" \
--packages "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1" \
Related
I am trying to print tweet data, but I get an error that I can't fix. When I try to run the code in the docs I still get the same error. Is this a Python 3.8 issue?
Code in the docs:
for tweet in tweepy.Paginator(client.search_recent_tweets, "Tweepy",
                              max_results=100).flatten(limit=250):
    print(tweet.id)
Stack Trace:
Traceback (most recent call last):
File "scraper.py", line 38, in <module>
main()
File "scraper.py", line 34, in main
for item in paginator:
File "/Users/troy/Desktop/streamlit/env/lib/python3.8/site-packages/tweepy/pagination.py", line 100, in __next__
self.previous_token = response.meta.get("previous_token")
AttributeError: 'Response' object has no attribute 'meta'
My Code:
import tweepy
import requests
import os
import pandas as pd

# global tokens
api_key = os.environ.get('Twitter_API_Key')
api_secret = os.environ.get('Twitter_API_Secret')
access_token = os.environ.get('Twitter_Access_Token')
access_secret = os.environ.get('Twitter_Access_Secret')
bearer = os.environ.get('bearer_token')

def create_client():
    client = tweepy.Client(bearer_token=bearer,
                           return_type=requests.Response,
                           wait_on_rate_limit=True)
    return client

def create_paginator(authenticated_client):
    paginator = tweepy.Paginator(
        authenticated_client.search_recent_tweets,
        query='from:elonmusk',
        tweet_fields=['author_id', 'id', 'created_at'],
        max_results=100,
        limit=5)
    return paginator

def main():
    client = create_client()
    paginator = create_paginator(client)
    print(paginator)
    for item in paginator:
        print(item)

if __name__ == "__main__":
    main()
Turns out I needed .flatten(). Don't know why, but hey, that's show business.
def create_paginator(authenticated_client, query):
    paginator = tweepy.Paginator(authenticated_client.search_recent_tweets, query=query,
                                 tweet_fields=['author_id', 'id', 'created_at'], max_results=10).flatten(limit=5)
    return paginator
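For reference, a minimal usage sketch of the fixed helper, assuming the client is built with tweepy's default return type (not return_type=requests.Response) so the flattened paginator yields Tweet objects directly, as in the docs example above; bearer is the environment-variable token from the question:
import tweepy

client = tweepy.Client(bearer_token=bearer, wait_on_rate_limit=True)

# flatten() iterates individual tweets across pages, up to the given limit
for tweet in create_paginator(client, query='from:elonmusk'):
    print(tweet.id, tweet.created_at)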
I have a query that I use to fetch data from Cloud Firestore, but it throws a KeyError. I am not sure what is wrong. The code is below.
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import pandas as pd
import sys
import os
import fileinput

# Use the application default credentials
cred = credentials.Certificate('creddatabase.json')
firebase_admin.initialize_app(cred)
db = firestore.client()

docs = db.collection(u'testcollection').stream()
users_ref = db.collection(u'testcollection')
docs = users_ref.stream()

array = []
is_header_printed = False
rows_keys = None
out_file = open("testcollection.csv", "w")

for doc in docs:
    print(doc.id)
    row_items = doc.to_dict()
    print(doc)
    if is_header_printed == False:
        rows_keys = list(row_items.keys())
        rows_keys.sort(reverse=False)
        # loop through every key and print the header row
        for key in rows_keys:
            print(key)
            out_file.write(str(key.replace(",", "_")) + ",")
        out_file.write("\n")
        is_header_printed = True
    for key in rows_keys:
        print(key)
        out_file.write(str(row_items[key]).replace(',', ' ') + ',')
    out_file.write("\n")
    print(str(doc.to_dict()['StartDate'].year) + '/' + str(doc.to_dict()['StartDate'].month) + '/' + str(doc.to_dict()['StartDate'].day))

out_file.close()
My document in collection looks like this:
Comments: ""
Objectived: "test"
Media: "test media"
notes: "test notest"
Zone A: 0
Zone B: 0
Error message:
Traceback (most recent call last):
File "/Users/DataScience/Firebasequery/Firebase_query_for_psolzsol_jnhibtionsingalling.py", line 49, in <module>
out_file.write(str(row_items[key]).replace(',',' ')+',')
KeyError: 'Media'
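The traceback points at the value-writing loop: the header keys are taken from the first document only, so a later document that lacks one of those fields (here 'Media') raises KeyError. A minimal sketch with hypothetical documents showing how dict.get with a default avoids that:
# Hypothetical data illustrating the failure mode: the header was built from a
# document that had 'Media', but this document does not.
rows_keys = ['Comments', 'Media', 'Objectived']
row_items = {'Comments': '', 'Objectived': 'test'}

# row_items['Media'] would raise KeyError; .get() substitutes an empty cell instead
row = [str(row_items.get(key, '')).replace(',', ' ') for key in rows_keys]
print(','.join(row))  # -> ',,test'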
Hi there, I wanted to make a Spotify voice assistant, so I found a video on YouTube where the author goes through his code and how it works, and he left the source code on his GitHub. I used that and configured it to work with my settings, but I'm getting an AttributeError on one of his lines. There are three files, "main.py", "setup.txt" and "pepper.py", but the problem is in main.py, so I'm going to drop the code below.
main.py:
import pandas as pd
from speech_recognition import Microphone, Recognizer, UnknownValueError
import spotipy as sp
from spotipy.oauth2 import SpotifyOAuth
from pepper import *

# Set variables from setup.txt
setup = pd.read_csv(r'C:\Users\Yousif\Documents\Python spotify\setup.txt', sep='=', index_col=0, squeeze=True, header=None)
client_id = setup['client_id']
client_secret = setup['client_secret']
device_name = setup['device_name']
redirect_uri = setup['redirect_uri']
scope = setup['scope']
username = setup['username']

# Connecting to the Spotify account
auth_manager = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    scope=scope,
    username=username)
spotify = sp.Spotify(auth_manager=auth_manager)

# Selecting device to play from
devices = spotify.devices()
deviceID = None
for d in devices['devices']:
    d['name'] = d['name'].replace('’', '\'')
    if d['name'] == device_name:
        deviceID = d['id']
        break

# Setup microphone and speech recognizer
r = Recognizer()
m = None
input_mic = 'Voicemod Virtual Audio Device (WDM)'  # Use whatever is your desired input
for i, microphone_name in enumerate(Microphone.list_microphone_names()):
    if microphone_name == input_mic:
        m = Microphone(device_index=i)

while True:
    """
    Commands will be entered in the specific format explained here:
    - the first word will be one of: 'album', 'artist', 'play'
    - then the name of whatever item is wanted
    """
    with m as source:
        r.adjust_for_ambient_noise(source=source)
        audio = r.listen(source=source)

    command = None
    try:
        command = r.recognize_google(audio_data=audio).lower()
    except UnknownValueError:
        continue

    print(command)
    words = command.split()
    if len(words) <= 1:
        print('Could not understand. Try again')
        continue

    name = ' '.join(words[1:])
    try:
        if words[0] == 'album':
            uri = get_album_uri(spotify=spotify, name=name)
            play_album(spotify=spotify, device_id=deviceID, uri=uri)
        elif words[0] == 'artist':
            uri = get_artist_uri(spotify=spotify, name=name)
            play_artist(spotify=spotify, device_id=deviceID, uri=uri)
        elif words[0] == 'play':
            uri = get_track_uri(spotify=spotify, name=name)
            play_track(spotify=spotify, device_id=deviceID, uri=uri)
        else:
            print('Specify either "album", "artist" or "play". Try Again')
    except InvalidSearchError:
        print('InvalidSearchError. Try Again')
The exact error is:
Traceback (most recent call last):
File "c:/Users/Yousif/Documents/Python spotify/main.py", line 49, in <module>
with m as source:
AttributeError: __enter__
__enter__ is a Python method that allows you to implement objects that can be used easily with the with statement. A useful example could be a database connection object (which then automagically closes the connection once the corresponding with statement goes out of scope):
class DatabaseConnection(object):

    def __enter__(self):
        # make a database connection and return it
        ...
        return self.dbconn

    def __exit__(self, exc_type, exc_val, exc_tb):
        # make sure the dbconnection gets closed
        self.dbconn.close()
        ...

The error here is caused because m = None, and None cannot be used in a with statement.
>>> with None as a:
...     print(a)
...
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: __enter__
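In main.py, m stays None because the loop never finds a microphone whose name equals input_mic, so the with statement receives None. A minimal guard sketch, assuming the speech_recognition package is installed (Microphone() with no arguments uses the system default input device):
from speech_recognition import Microphone

input_mic = 'Voicemod Virtual Audio Device (WDM)'  # desired input device name
m = None
for i, microphone_name in enumerate(Microphone.list_microphone_names()):
    if microphone_name == input_mic:
        m = Microphone(device_index=i)

if m is None:
    # The named device was not found; fall back to the system default input
    print(f'Input device "{input_mic}" not found, using the default microphone')
    m = Microphone()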
The parse_url function always works fine when we use Spark SQL through a SQL client (via the Thrift server), IPython, or the pyspark shell, but it does not work in spark-submit mode:
/opt/spark/bin/spark-submit --driver-memory 4G --executor-memory 8G main.py
The error is:
Traceback (most recent call last):
File "/home/spark/***/main.py", line 167, in <module>
)v on registrations.ga = v.ga and reg_path = oldtrack_page and registration_day = day_cl_log and date_cl_log <= registration_date""")
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/context.py", line 552, in sql
File "/opt/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 40, in deco
pyspark.sql.utils.AnalysisException: undefined function parse_url;
Build step 'Execute shell' marked build as failure
Finished: FAILURE
So, we are using this workaround for now:
def python_parse_url(url, que, key):
    import urlparse
    ians = None
    if que == "QUERY":
        ians = urlparse.parse_qs(urlparse.urlparse(url).query)[key][0]
    elif que == "HOST":
        ians = urlparse.urlparse(url).hostname
    elif que == "PATH":
        ians = urlparse.urlparse(url).path
    return ians

def dc_python_parse_url(url, que, key):
    ians = None
    try:
        ians = python_parse_url(url, que, key)
    except:
        pass
    return ians

sqlCtx.registerFunction('my_parse_url', dc_python_parse_url)

Can anyone please help with this issue?
Spark >= 2.0
Same as below, but use SparkSession with Hive support enabled:
SparkSession.builder.enableHiveSupport().getOrCreate()
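For completeness, a minimal sketch of that path (assuming a Hive-enabled Spark build is available):
from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
# parse_url resolves once Hive support is enabled
spark.sql("SELECT parse_url('http://example.com/foo/bar?foo=bar', 'HOST') AS host").show()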
Spark < 2.0
parse_url is not a standard SQL function. It is a Hive UDF and as such requires HiveContext to work:
from pyspark import SparkContext
from pyspark.sql import HiveContext, SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)

query = """SELECT parse_url('http://example.com/foo/bar?foo=bar', 'HOST')"""

sqlContext.sql(query)
## Py4JJavaError Traceback (most recent call last)
## ...
## AnalysisException: 'undefined function parse_url;'

hiveContext.sql(query)
## DataFrame[_c0: string]
Below is the code I have been working on.
The very last line write_csv('twitter_gmail.csv', messages, append=True) throws a
[ec2-user#ip-172-31-46-164 ~]$ ./twitter_test16.sh
Traceback (most recent call last):
File "./twitter_test16.sh", line 53, in
write_csv('twitter_gmail.csv', messages, append=True)
NameError: name 'messages' is not defined
I have messages defined, so I don't understand why it would do that.
import csv
import json
import oauth2 as oauth
import urllib
import sys
import requests
import time

# credentials redacted in the post
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_KEY = ""
ACCESS_SECRET = ""

class TwitterSearch:
    def __init__(self, ckey=CONSUMER_KEY, csecret=CONSUMER_SECRET,
                 akey=ACCESS_KEY, asecret=ACCESS_SECRET,
                 query='https://api.twitter.com/1.1/search/tweets.{mode}?{query}'
                 ):
        consumer = oauth.Consumer(key=ckey, secret=csecret)
        access_token = oauth.Token(key=akey, secret=asecret)
        self.client = oauth.Client(consumer, access_token)
        self.query = query

    def search(self, q, mode='json', **queryargs):
        queryargs['q'] = q
        query = urllib.urlencode(queryargs)
        return self.client.request(self.query.format(query=query, mode=mode))

def write_csv(fname, rows, header=None, append=False, **kwargs):
    filemode = 'ab' if append else 'wb'
    with open(fname, filemode) as outf:
        out_csv = csv.writer(outf, **kwargs)
        if header:
            out_csv.writerow(header)
        out_csv.writerows(rows)

def main():
    ts = TwitterSearch()
    response, data = ts.search('#gmail.com', result_type='recent')
    js = json.loads(data)
    messages = ([msg['created_at'], msg['txt'], msg['user']['id']] \
                for msg in js.get('statuses', []))

write_csv('twitter_gmail.csv', messages, append=True)
The previous line is missing a parenthesis.
messages = ([msg['created_at'], msg['txt'], msg['user']['id']] for msg in js.get('statuses', [])
Should be:
messages = ([msg['created_at'], msg['txt'], msg['user']['id']] for msg in js.get('statuses', []))
I'm surprised that it works when you change to print? Are you also changing the comprehension when you do that?
You asked why the line number of the error was after the bad syntax?
Try putting this in line one of a file and running it, and note the line of the SyntaxError.
a = (]
Then try this and check out the line number:
a = (
b = "some stuff"
Finally, try this:
a = (


b = "some stuff"
Think about when you would know that the programmer had made a python-illegal typo if you were reading the code and carrying it out via pen and paper.
Basically, a SyntaxError is raised as soon as it can be unambiguously determined that invalid syntax was used, which is often immediately after the statement where the mistake was made, not immediately at it.
You'll frequently get line numbers on SyntaxErrors that are a line (or several lines, if there are empty lines or a corner case) below the actual typo.