The function parse_url always works fine when we use spark-sql through the sql-client (via the Thrift server), IPython, or the pyspark shell, but it does not work when run through spark-submit:
/opt/spark/bin/spark-submit --driver-memory 4G --executor-memory 8G main.py
The error is:
Traceback (most recent call last):
File "/home/spark/***/main.py", line 167, in <module>
)v on registrations.ga = v.ga and reg_path = oldtrack_page and registration_day = day_cl_log and date_cl_log <= registration_date""")
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/context.py", line 552, in sql
File "/opt/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 40, in deco
pyspark.sql.utils.AnalysisException: undefined function parse_url;
Build step 'Execute shell' marked build as failure
Finished: FAILURE
So we are using this workaround:
def python_parse_url(url, que, key):
    import urlparse
    ians = None
    if que == "QUERY":
        ians = urlparse.parse_qs(urlparse.urlparse(url).query)[key][0]
    elif que == "HOST":
        ians = urlparse.urlparse(url).hostname
    elif que == "PATH":
        ians = urlparse.urlparse(url).path
    return ians

def dc_python_parse_url(url, que, key):
    ians = None
    try:
        ians = python_parse_url(url, que, key)
    except:
        pass
    return ians

sqlCtx.registerFunction('my_parse_url', dc_python_parse_url)
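We then call the registered function in SQL in place of parse_url, for example:
sqlCtx.sql("SELECT my_parse_url('http://example.com/foo/bar?foo=bar', 'QUERY', 'foo')")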
Any help with this issue would be appreciated.
Spark >= 2.0
Same as below, but use SparkSession with Hive support enabled:
SparkSession.builder.enableHiveSupport().getOrCreate()
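For example, a minimal sketch (only the session creation changes, the query itself stays the same):
from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
# HOST part of the URL, i.e. example.com
spark.sql("SELECT parse_url('http://example.com/foo/bar?foo=bar', 'HOST')").show()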
Spark < 2.0
parse_url is not a standard SQL function. It is a Hive UDF, and as such it requires a HiveContext to work:
from pyspark import SparkContext
from pyspark.sql import HiveContext, SQLContext
sc = SparkContext()
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)
query = """SELECT parse_url('http://example.com/foo/bar?foo=bar', 'HOST')"""
sqlContext.sql(query)
## Py4JJavaError Traceback (most recent call last)
## ...
## AnalysisException: 'undefined function parse_url;'
hiveContext.sql(query)
## DataFrame[_c0: string]
Related
I have written code to monitor RAM usage, CPU usage, and CPU temperature. I have used both psutil and WMI, and I'm having a problem: the code ran perfectly when I had Windows 10, but since I updated to Windows 11 it is not working. I have checked the Python interpreter; it is on 3.10.
This is my output/error message:
C:\Users\jeries\PycharmProjects\PP1\venv\Scripts\python.exe C:/Users/jeries/PycharmProjects/PP1/study.py
The CPU usage is: 47.1
RAM memory % used: 54.0
Traceback (most recent call last):
File "C:\Users\jeries\PycharmProjects\PP1\venv\lib\site-packages\wmi.py", line 880, in query
return self._namespace.query(wql, self, fields)
File "C:\Users\jeries\PycharmProjects\PP1\venv\lib\site-packages\wmi.py", line 1072, in query
return [ _wmi_object(obj, instance_of, fields) for obj in self._raw_query(wql) ]
File "C:\Users\jeries\PycharmProjects\PP1\venv\lib\site-packages\wmi.py", line 1072, in <listcomp>
return [ _wmi_object(obj, instance_of, fields) for obj in self._raw_query(wql) ]
File "C:\Users\jeries\PycharmProjects\PP1\venv\lib\site-packages\win32com\client\dynamic.py", line 324, in __getitem__
return self._get_good_object_(self._enum_.__getitem__(index))
File "C:\Users\jeries\PycharmProjects\PP1\venv\lib\site-packages\win32com\client\util.py", line 41, in __getitem__
return self.__GetIndex(index)
File "C:\Users\jeries\PycharmProjects\PP1\venv\lib\site-packages\win32com\client\util.py", line 62, in __GetIndex
result = self._oleobj_.Next(1)
pywintypes.com_error: (-2147217372, 'OLE error 0x80041024', None, None)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\jeries\PycharmProjects\PP1\study.py", line 30, in <module>
temperature_infos = w.Sensor()
File "C:\Users\jeries\PycharmProjects\PP1\venv\lib\site-packages\wmi.py", line 882, in query
handle_com_error()
File "C:\Users\jeries\PycharmProjects\PP1\venv\lib\site-packages\wmi.py", line 258, in handle_com_error
raise klass(com_error=err)
wmi.x_wmi: <x_wmi: Unexpected COM Error (-2147217372, 'OLE error 0x80041024', None, None)>
Process finished with exit code 1
I have tried this:
w = wmi.WMI(namespace="root\openHardwareMonitor")
temperature_infos = w.Sensor()
for sensor in temperature_infos:
    if sensor.SensorType == u'Temperature':
        print(sensor.Name)
        print(sensor.Value)
It is not working; it says for w.Sensor(): "no documentation found".
This is my current code:
import os
import psutil
import wmi

def avg(value_list):
    num = 0
    length = len(value_list)
    for val in value_list:
        num += val
    return num / length

# Calling psutil.cpu_percent() after 2 seconds
print('The CPU usage is: ', psutil.cpu_percent(2))
print('RAM memory % used:', psutil.virtual_memory()[2])

# have the Open Hardware Monitor application opened
w = wmi.WMI(namespace="root\\OpenHardwareMonitor")
sensors = w.Sensor()

cpu_temps = []
gpu_temp = 0
for sensor in sensors:
    if sensor.SensorType == u'Temperature' and not 'GPU' in sensor.Name:
        cpu_temps += [float(sensor.Value)]
    elif sensor.SensorType == u'Temperature' and 'GPU' in sensor.Name:
        gpu_temp = sensor.Value

print("Avg CPU: {}".format(avg(cpu_temps)))
print("GPU: {}".format(gpu_temp))
OpenHardwareMonitor can generate logs (Options / Log Sensors).
The log is called OpenHardwareMonitorLog-YYYY-MM-DD.csv.
The idea is therefore to launch OpenHardwareMonitor beforehand (it can be started via a scheduled task plus subprocess, or set to run automatically at startup), and then read the correct column from the last line of the file:
# Code
from datetime import date

while True:
    # Build the log file name
    now = date.today()
    infile = r"C:\OpenHardwareMonitor\OpenHardwareMonitorLog-" + now.strftime("%Y-%m-%d") + ".csv"
    # Open read-only
    with open(infile, "r") as f:
        f = f.readlines()[-1]  # Read the last line
    output = f.split(',')  # Split on ","
    print(output[10])  # 10 = CPU Core #1 temperature column
Edit:
You will have to find your column number by looking at the log; it's 10 for me, but it may change depending on your config...
I'm just starting out; the script could be improved by scanning the first two lines and determining the correct column from its name, as sketched below ;-)
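For example, a minimal sketch of that improvement (I'm assuming here that the second header line of the log holds the sensor names, e.g. "CPU Core #1"; adjust the name and the row index to match your file):
from datetime import date

target = "CPU Core #1"  # hypothetical sensor/column name; check your log header

now = date.today()
infile = r"C:\OpenHardwareMonitor\OpenHardwareMonitorLog-" + now.strftime("%Y-%m-%d") + ".csv"

with open(infile, "r") as f:
    lines = f.readlines()

# Strip optional quotes around the header cells, then locate the column by name
header = [cell.strip().strip('"') for cell in lines[1].split(',')]
col = header.index(target)
print(lines[-1].split(',')[col])  # most recent value for that sensor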
from unittest import TestCase
from pyspark.sql import SparkSession
from lib.utils import load_survey_df, count_by_country

class UtilsTestCase(TestCase):

    @classmethod
    def create_testing_pyspark_session(cls):
        return SparkSession.builder.master('local[2]').appName('my - local - testing - pyspark - context').getOrCreate()

    @classmethod
    def SetUpClass(cls) -> None:
        cls.spark = cls.create_testing_pyspark_session()

    def test_datafile_loading(self):
        sample_df = load_survey_df(self.spark, "data/sample.csv")
        result_count = sample_df.count()
        self.assertEqual(result_count, 9, "Record count should be 9")

    def test_country_count(self):
        sample_df = load_survey_df(self.spark, "data/sample.csv")
        count_list = count_by_country(sample_df).collect()
        count_dict = dict()
        for row in count_list:
            count_dict[row["Country"]] = row["count"]
        self.assertEqual(count_dict["United States"], 4, "Count for United States should be 4")
        self.assertEqual(count_dict["Canada"], 2, "Count for Canada should be 2")
        self.assertEqual(count_dict["United Kingdom"], 1, "Count for United Kingdom should be 1")
Hi all,
Can you please tell me what's wrong with this code? I am getting the error below:
Error
Traceback (most recent call last):
File "C:\Users\abc\AppData\Local\Continuum\anaconda3\lib\unittest\case.py", line 59, in testPartExecutor
yield
File "C:\Users\abc\AppData\Local\Continuum\anaconda3\lib\unittest\case.py", line 628, in run
testMethod()
File "C:\Users\abc\PycharmProjects\HelloSpark\lib\test_utils.py", line 17, in test_datafile_loading
sample_df = load_survey_df(self.spark, "data/sample.csv")
AttributeError: 'UtilsTestCase' object has no attribute 'spark'
I see that within your SetUpClass method you are using cls.spark;
you need to declare it as an attribute in the class UtilsTestCase.
Just adding spark after your class declaration, as follows, should fix it:
class UtilsTestCase(TestCase):
    spark = None

    @classmethod
    def create_testing_pyspark_session(cls):
        return SparkSession.builder.master('local[2]').appName('my - local - testing - pyspark - context').getOrCreate()

    @classmethod
    def SetUpClass(cls) -> None:
        cls.spark = cls.create_testing_pyspark_session()
I want to run some example code from GitHub (https://github.com/kaantas/spark-twitter-sentiment-analysis). I followed the steps below:
Started zkserver
Started Kafka 2.5.0 (I am also using Apache Spark 3.0.0 and JDK 8)
Started tweet_listener.py (tweets start to stream; I can see them in the cmd window)
Opened twitter_topic_avg_sentiment_val.py with Spyder, and it just shows the text below
Note: I don't know anything about jars; if I need to use an external jar, please explain how.
THANKS A LOT...
Traceback (most recent call last):
File "C:\Users\merha\Desktop\spark-twitter-sentiment-analysis-master\twitter_topic_avg_sentiment_val.py", line 40, in <module>
query.awaitTermination()
File "C:\Anaconda3\lib\site-packages\pyspark\sql\streaming.py", line 103, in awaitTermination
return self._jsq.awaitTermination()
File "C:\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "C:\Anaconda3\lib\site-packages\pyspark\sql\utils.py", line 137, in deco
raise_from(converted)
File "<string>", line 3, in raise_from
StreamingQueryException: org/apache/spark/kafka010/KafkaConfigUpdater
=== Streaming Query ===
Identifier: [id = f5dd9cb5-fcea-42ec-a20e-93a2ad233e1f, runId = 6cffdd89-3792-4500-a508-e4abc76425fb]
Current Committed Offsets: {}
Current Available Offsets: {}
Current State: INITIALIZING
Thread State: RUNNABLE
------------------------------ tweet_listener.py ------------------------------
from tweepy import Stream, OAuthHandler
from tweepy.streaming import StreamListener
import json
import twitter_config
import pykafka
from afinn import Afinn
import sys
from sys import exit

class TweetListener(StreamListener):
    def __init__(self):
        self.client = pykafka.KafkaClient("localhost:9092")
        self.producer = self.client.topics[bytes('twitter3', 'ascii')].get_producer()

    def on_data(self, data):
        try:
            json_data = json.loads(data)
            send_data = '{}'
            json_send_data = json.loads(send_data)
            json_send_data['text'] = json_data['text']
            json_send_data['senti_val'] = afinn.score(json_data['text'])
            print(json_send_data['text'], " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ", json_send_data['senti_val'])
            self.producer.produce(bytes(json.dumps(json_send_data), 'ascii'))
            return True
        except KeyError:
            return True

    def on_error(self, status):
        print(status)
        return True

consumer_key = "xxxxxxxxxx"
consumer_secret = "xxxxxxxxxxx"
access_token = "xxxxxxxxxxxx"
access_secret = "xxxxxxxxxx"

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# create AFINN object for sentiment analysis
afinn = Afinn()

twitter_stream = Stream(auth, TweetListener())
twitter_stream.filter(languages=['en'], track=["big data"])
------------------------ twitter_topic_avg_sentiment_val.py ------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import json
import sys
from pyspark.sql.types import *

def fun(avg_senti_val):
    try:
        if avg_senti_val < 0: return 'NEGATIVE'
        elif avg_senti_val == 0: return 'NEUTRAL'
        else: return 'POSITIVE'
    except TypeError:
        return 'NEUTRAL'

if __name__ == "__main__":
    schema = StructType([
        StructField("text", StringType(), True),
        StructField("senti_val", DoubleType(), True)
    ])

    spark = SparkSession.builder.appName("TwitterSentimentAnalysis").getOrCreate()

    kafka_df = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "twitter3").option("startingOffsets", "earliest").load()
    kafka_df_string = kafka_df.selectExpr("CAST(value AS STRING)")
    tweets_table = kafka_df_string.select(from_json(col("value"), schema).alias("data")).select("data.*")

    sum_val_table = tweets_table.select(avg('senti_val').alias('avg_senti_val'))

    # udf = USER DEFINED FUNCTION
    udf_avg_to_status = udf(fun, StringType())

    # average of senti_val column to status column
    new_df = sum_val_table.withColumn("status", udf_avg_to_status("avg_senti_val"))

    query = kafka_df_string.writeStream.format("console").option("truncate", "false").start()
    query.awaitTermination()
After I downloaded this jar file
spark-token-provider-kafka-0-10
and copied it to the Spark jars folder (or added it to SPARK_CLASSPATH), my problem was resolved.
Have you tried submitting your Spark job with the Kafka package as a configuration? See the third line:
spark-submit --master yarn --deploy-mode cluster \
--py-files "${PY_ZIP}" \
--packages "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1" \
Below is my Python code. I read the keys from a CSV file and delete them in the database. It runs fine for a while and then throws this timeout error. I don't see any GC issue, and the health of the node looks fine.
Traceback (most recent call last):
File "/Users/XXX/Downloads/XXX/XXX", line 65, in <module>
parse_file(datafile)
File "/Users/XXX/Downloads/XXX/XXX", line 49, in parse_file
session = cluster.connect('XXX')
File "cassandra/cluster.py", line 1193, in cassandra.cluster.Cluster.connect (cassandra/cluster.c:17796)
File "cassandra/cluster.py", line 1240, in cassandra.cluster.Cluster._new_session (cassandra/cluster.c:18952)
File "cassandra/cluster.py", line 1980, in cassandra.cluster.Session.__init__ (cassandra/cluster.c:35191)
cassandra.cluster.NoHostAvailable: ("Unable to connect to any servers using keyspace 'qualys_ioc'", ['127.0.0.1'])
Python Code:
import argparse
import sys
import itertools
import codecs
import uuid
import os
import subprocess

try:
    import cassandra
    import cassandra.concurrent
except ImportError:
    sys.exit('Python Cassandra driver not installed. You might try \"pip install cassandra-driver\".')

from cassandra.cluster import Cluster, ResultSet, Session
from cassandra.policies import DCAwareRoundRobinPolicy
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import ConsistencyLevel
from cassandra import ReadTimeout

datafile = "/Users/XXX/adf.csv"
if os.path.exists(datafile):
    os.remove(datafile)

def dumptableascsv():
    os.system(
        "sh /Users/XXX/Documents/dse-5.0.14/bin/cqlsh 127.0.0.1 9042 -u cassandra -p cassandra -e \" COPY XXX.agent_delta_fragment(agent_id,delta_id ,last_fragment_id ,processed) TO \'/Users/XXX/adf.csv\' WITH HEADER = true;\"\n"
        " ")
    # print datafile

def parse_file(datafile):
    global fields
    data = []
    with open(datafile, "rb") as f:
        header = f.readline().split(",")
        # Loop through remaining lines in file object f
        for line in f:
            fields = line.split(",")  # Split line into list
            # print fields[3]
            if fields[3]:
                print "connect"
                print fields[0], fields[1], fields[2], fields[3]
                auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')
                cluster = Cluster(['127.0.0.1'],
                                  load_balancing_policy=DCAwareRoundRobinPolicy(local_dc='Cassandra'),
                                  port=9042, auth_provider=auth_provider, connect_timeout=10000,)
                session = cluster.connect('XXX')
                # session = cluster.connect('XXX')
                # session.execute("select * from XXX.agent_delta_fragment LIMIT 1")
                # rows = session.execute('select agent_id from XXX.agent_delta_fragment LIMIT 1')
                # for row in rows:
                #     print row.agent_id
                # batch = BatchStatement("DELETE FROM XXX.agent_delta_fragment_detail_test WHERE agent_id=%s and delta_id=%s and fragment_id=%s", (uuid.UUID(fields[0]), uuid.UUID(fields[1]), int(fields[3])))
                session.execute("DELETE FROM XXX.agent_delta_fragment_detail WHERE agent_id=%s and delta_id=%s and fragment_id=%s", (uuid.UUID(fields[0]), uuid.UUID(fields[1]), int(fields[2])), timeout=1000000)
                # session.execute(batch)
            else:
                print fields[3]
                print "connect-False"
                # print fields[3]

dumptableascsv()
parse_file(datafile)
I am having the same error in Python no matter where I put the db.commit() statement.
Here is my code:
from bottle import route, run
import json
import collections
import MySQLdb as db

@route('/register/<COD>/<NOMBRE>/<APELLIDO>/<DIRECCION>/<TEL>/<COD_FAC>', method='PUT')
def registrar(COD, NOMBRE, APELLIDO, DIRECCION, TEL, COD_FAC):
    c = db.connect('10.100.70.136', 'koala', 'toor', 'lab2', use_unicode=True)
    cur = c.cursor()
    sql1 = 'SELECT * FROM alumnos WHERE codigo="' + COD + '";'
    cur.execute(sql1)
    alumnos = cur.fetchall()
    i = 0
    for alumno in alumnos:
        i += 1
        print(i)
    if i == 0:
        operationResult = 1
        operationMessage = ""
        cur2 = c.cursor()
        sql2 = 'INSERT INTO alumnos (codigo,nombre,apellido,direccion,telefono,codigoFacultad) VALUES ("' + COD + '","' + NOMBRE + '","' + APELLIDO + '","' + DIRECCION + '","' + TEL + '","' + COD_FAC + '");'
        cur2.execute(sql2)
    else:
        operationResult = 2
        operationMessage = "El alumno con codigo " + COD + " ya se encuentra registrado"
    db.commit()
    db.close()
    results = []
    d = collections.OrderedDict()
    d['operationResult'] = operationResult
    d['operationMessage'] = operationMessage
    results.append(d)
    j = json.dumps(results)
    return j

run(host='localhost', port=8080, debug=True)
The error that I get is this:
AttributeError("'module' object has no attribute 'commit'",)
And the description that I get is the following:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/bottle-0.12.7-py2.7.egg/bottle.py", line 862, in _handle
return route.call(**args)
File "/usr/local/lib/python2.7/dist-packages/bottle-0.12.7-py2.7.egg/bottle.py", line 1729, in wrapper
rv = callback(*a, **ka)
File "tarea.preg4.py", line 33, in registrar
db.commit()
AttributeError: 'module' object has no attribute 'commit'
You want to call commit on the connection object, which is c in your case (so make it c.commit() instead of db.commit()).
You can find the Python DB-API connection methods here.
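A minimal sketch of the corrected lines (only the parts that change; db.close() has the same problem, so it becomes c.close() as well):
c = db.connect('10.100.70.136', 'koala', 'toor', 'lab2', use_unicode=True)
cur = c.cursor()
# ... run the SELECT / INSERT exactly as before ...
c.commit()  # commit on the connection object, not on the MySQLdb module
c.close()   # same for close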