Loading a DataFrame with a check in PySpark gives me an empty DataFrame - Python

I am trying to load data into a DataFrame using PySpark. The files are in Parquet format. I am using the following code:
import datetime
from datetime import date, timedelta as td
from datetime import datetime
import pandas as pd
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql import SparkSession, HiveContext
from pyspark.sql.types import (StructType, StructField, IntegerType, StringType, BooleanType,
                               DateType, TimestampType, LongType, FloatType, DoubleType,
                               ArrayType, ShortType)
from pyspark.sql.functions import lit
daterange = pd.date_range('2019-12-01', '2019-12-31')
df = sqlContext.createDataFrame(sc.emptyRDD())

for process_date in daterange:
    try:
        name = 's3://location/process_date={}'.format(process_date.strftime("%Y-%m-%d")) + '/'
        print(name)
        x = spark.read.parquet(name)
        x = x.withColumn('process_date', lit(process_date.strftime("%Y-%m-%d")))
        x.show()
        df = df.union(x)
    except:
        print("File doesn't exist for " + str(process_date.strftime("%Y-%m-%d")))
But when I run this code, the resulting df is an empty DataFrame, and even though data exists for some of the dates, the exception message is printed for every date in the range.
Can anyone tell me what I am doing wrong?

I think the problem is the union combined with a too-broad except clause.
union only works if the schemas of the DataFrames being unioned are the same.
Hence emptyDF.union(nonEmptyDF) raises an error, which the bare except then swallows, so every date is reported as a missing file.
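Here is a minimal sketch of one way to restructure the loop, assuming Spark 2.3+ and an existing spark session: read each day's partition, keep the successfully loaded DataFrames in a list, catch only the read failure, and union everything at the end so the schema-less empty DataFrame is never needed. The bucket path and date range come from the question; the rest is illustrative.
from functools import reduce
import pandas as pd
from pyspark.sql.functions import lit
from pyspark.sql.utils import AnalysisException

daterange = pd.date_range('2019-12-01', '2019-12-31')
frames = []

for process_date in daterange:
    day = process_date.strftime("%Y-%m-%d")
    name = 's3://location/process_date={}/'.format(day)
    try:
        # Tag each day's data with its partition date.
        frames.append(spark.read.parquet(name).withColumn('process_date', lit(day)))
    except AnalysisException:
        # A missing path raises AnalysisException; other unexpected errors
        # now surface instead of being silently swallowed.
        print("File doesn't exist for " + day)

# unionByName aligns columns by name across all the daily DataFrames.
df = reduce(lambda a, b: a.unionByName(b), frames) if frames else None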

Related

Getting an InvalidBSON error in PyMongo code. How to solve this?

import pymongo
from pymongo import MongoClient
from bson.objectid import ObjectId
import numpy as np
import pandas as pd
from datetime import datetime
import datetime as dt

test = MongoClient("mongodb://harikrishna:ch9u2encn4e9hc3n0284hci-032n8c4#35.200.240.21:27017/tb_dev")
testdb = test.tb_dev

submission = testdb.test_submission.aggregate([
    {"$match": {"createdOn": {"$gte": datetime.strptime("2020-11-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"),
                              "$lt": datetime.strptime("2021-01-01T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z")},
                "isQuiz": False}},
    {"$project": {"sid": 1, "tid": 1, "marks": 1, "timeTaken": {"$divide": ["$timeTaken", 60]}, "totalAnswered": 1}}
], allowDiskUse=True)

submission = pd.DataFrame(list(submission))
submission.head()
I am trying to execute this code, but I get an InvalidBSON error on the submission = pd.DataFrame(list(submission)) line. Help me decode this error.

Error: column is of type timestamp without time zone but expression is of type double precision

The following code...
import time
import datetime
from datetime import datetime, timezone, timedelta
import os
import re
import csv
import glob
import codecs
import collections
import runpy
import threading
from threading import Timer
from multiprocessing import Process, Queue
import warnings

import numpy as np
import pandas as pd
import pyodbc
import psycopg2
import xlsxwriter
import pythoncom
import win32com.client
from win32com import client
from sqlalchemy import create_engine, event
#from docx import Document
#from docx.shared import Inches, Pt, Mm

warnings.filterwarnings("ignore")
kronos_df = pd.read_excel(r"\\teslamotors.com\us\Public\stalamakki\ExcelFiles\KronosDataHourlyRefresh.xls")
kronos_df.fillna('')
clockRecords = kronos_df.to_dict('records')
sqlUpsert = """
INSERT INTO "daily_performance_metrics"."employee_kronos_data_2"
VALUES (%s,%s,%s,%s,%s)
"""
# VALUES (%s,to_timestamp(%s, 'YY-MM-DD HH24:MI'),COALESCE(to_timestamp(NULLIF(%s, '01/01/01 00:00'),'MM/DD/YY hh24:mi')),%s,%s)
#sqlDelete = """
# DELETE FROM "daily_performance_metrics"."employee_kronos_data" WHERE CustomerName='Alfreds Futterkiste';
postgres_conn = psycopg2.connect("host=sjc04p1scadb02.teslamotors.com dbname=service_warehouse_metrics user=service_warehouse_rw port=5432 password=gvjY96LcnWn2B3+obVjFsLG5erMy/4JNxgN00Lnq2n0=")
postgres_cursor = postgres_conn.cursor()
for record in clockRecords:
    if record['ShiftEnd'] == '':
        record['ShiftEnd'] = None
    if record['ShiftStart'] == '':
        record['ShiftStart'] = None
    postgres_cursor.execute(sqlUpsert, list(record.values()))
    postgres_conn.commit()
postgres_cursor.close()
postgres_conn.close()
...generates this error message when it tries to write what I assume is the first record with a null value...
---------------------------------------------------------------------------
DatatypeMismatch Traceback (most recent call last)
<ipython-input-73-2ef7c8c3820c> in <module>()
15 if record['ShiftStart'] == 'NaN':
16 record['ShiftStart'] = None
---> 17 postgres_cursor.execute(sqlUpsert,list(record.values()))
18 postgres_conn.commit()
19 postgres_cursor.close()
DatatypeMismatch: column "shift_end" is of type timestamp without time zone but expression is of type double precision
LINE 3: ... VALUES ('zvolkert','10/02/19 13:13','NaN'::flo...
^
HINT: You will need to rewrite or cast the expression.
To deal with the Nulls, I've tried this syntax for the INSERT statement...
INSERT INTO "daily_performance_metrics"."employee_kronos_data_2"
VALUES (%s,to_timestamp(%s, 'YY-MM-DD HH24:MI'),COALESCE(to_timestamp(NULLIF(%s, '01/01/01 00:00'),'MM/DD/YY hh24:mi')),%s,%s)
...which generates this error message...
InvalidTextRepresentation: invalid input syntax for type double precision: "01/01/01 00:00"
LINE 3: ...4:MI'),COALESCE(to_timestamp(NULLIF('NaN'::float, '01/01/01 ...
I'm assuming this is a very simple syntax mistake. I would really appreciate it if someone could tell me the correct syntax for getting these strings and null values into the timestamp fields.
I'm writing to this table...
CREATE TABLE daily_performance_metrics.employee_kronos_data_5 (
file_number TEXT
,shift_start TIMESTAMP
,shift_end TIMESTAMP
,job_category TEXT
,job_name TEXT
)
Here's the file that I'm trying to copy to the database:
ClockInOutRecords.xlsx
The solution was simply to replace the NaN values with None. I thought I was already doing that, but the == 'NaN' comparison was incorrect: pandas represents the empty cells as float NaN, which never equals the string 'NaN', so the float slipped through to psycopg2 (hence the 'NaN'::float in the error). The correct way to check for NaN is pd.isna(). Fixed code below...
sqlUpsert = """
INSERT INTO "daily_performance_metrics"."employee_kronos_data_2"
VALUES (%s,COALESCE(to_timestamp(%s, 'MM/DD/YY hh24:mi')),COALESCE(to_timestamp(%s, 'MM/DD/YY hh24:mi')),%s,%s)
"""
# VALUES (%s,%s,%s,%s,%s)
# VALUES (%s,to_timestamp(%s, 'YY-MM-DD HH24:MI'),COALESCE(to_timestamp(NULLIF(%s, '01/01/01 00:00'),'MM/DD/YY hh24:mi')),%s,%s)

postgres_conn = psycopg2.connect("host=sjc04p1scadb02.teslamotors.com dbname=service_warehouse_metrics user=service_warehouse_rw port=5432 password=gvjY96LcnWn2B3+obVjFsLG5erMy/4JNxgN00Lnq2n0=")
postgres_cursor = postgres_conn.cursor()
postgres_cursor.execute(sqlDelete)
for record in clockRecords:
    # print(list(record.values()))
    if pd.isna(record['ShiftEnd']):
        # print(record['ShiftEnd'])
        record['ShiftEnd'] = None
        # print(record['ShiftEnd'])
    if pd.isna(record['Job_Category']):
        record['Job_Category'] = None
    if pd.isna(record['Job_Name']):
        record['Job_Name'] = None
    # print(list(record.values()))
    postgres_cursor.execute(sqlUpsert, list(record.values()))
    # print('success')
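A possible simplification, not part of the original answer: the NaN-to-None conversion can be done once on the whole DataFrame before building the records, instead of checking each field inside the loop. A minimal sketch, assuming the same file and column layout as above; psycopg2 converts None to SQL NULL on its own.
import pandas as pd

kronos_df = pd.read_excel(r"\\teslamotors.com\us\Public\stalamakki\ExcelFiles\KronosDataHourlyRefresh.xls")

# Cast to object so .where() can hold None, then replace every NaN
# (ShiftStart, ShiftEnd, Job_Category, Job_Name, ...) in a single pass.
clean_df = kronos_df.astype(object).where(kronos_df.notna(), None)
clockRecords = clean_df.to_dict('records')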

TextBlob returns a neutral result for all negative comments in an AWS Glue PySpark job

I am facing an issue with TextBlob sentiment analysis. I have written PySpark code that runs as an AWS Glue Spark job, and it returns a neutral result for negative comments. I have tried the same code on Lambda (with Python 2.7, 3.6, and 3.7), on an AWS Glue Python shell job, and on my local machine, and the library works fine in all of those, but it gives neutral results on the AWS Glue Spark job.
This code returns the sentiment result for each comment:
import sys
import imp
import json
import uuid
import logging
import re
import datetime
import boto3

# Stub out the sqlite modules so the TextBlob import does not fail
# in environments where sqlite3 is unavailable.
sys.modules["sqlite"] = imp.new_module("sqlite")
sys.modules["sqlite3.dbapi2"] = imp.new_module("sqlite.dbapi2")
from textblob import TextBlob

from awsglue.utils import getResolvedOptions
from awsglue.transforms import *
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import *
from collections import OrderedDict

boto3.setup_default_session(region_name='us-east-1')

def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links and special
    characters using simple regex statements.
    '''
    return ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", tweet).split())

def get_tweet_sentiment(tweet):
    '''
    Utility function to classify the sentiment of the passed tweet
    using TextBlob's sentiment method.
    '''
    print("s", tweet)
    analysis = TextBlob(clean_tweet(tweet))
    print(analysis.sentiment.polarity)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'
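For context, a minimal sketch of how a function like this is typically applied to a DataFrame column in a PySpark/Glue job; the DataFrame df and the column name comment are placeholders, not names from the original post.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the plain Python function as a Spark UDF returning a string label.
sentiment_udf = udf(get_tweet_sentiment, StringType())

# 'df' and 'comment' stand in for the job's real input DataFrame and column.
scored = df.withColumn('sentiment', sentiment_udf(df['comment']))
scored.select('comment', 'sentiment').show(truncate=False)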

Saving Spark Streaming data from Kafka to Cassandra in Python

I am trying to save my streaming data from Spark to Cassandra. Spark is connected to Kafka and that part works fine, but saving to Cassandra is driving me crazy. I am using Spark 2.0.2, Kafka 0.10, and Cassandra 2.23.
This is how I submit the job to Spark:
spark-submit --verbose --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0 --jars /tmp/pyspark-cassandra-0.3.5.jar --driver-class-path /tmp/pyspark-cassandra-0.3.5.jar --py-files /tmp/pyspark-cassandra-0.3.5.jar --conf spark.cassandra.connection.host=localhost /tmp/direct_kafka_wordcount5.py localhost:9092 testing
And this is my code. It is just a small modification of the Spark examples; it runs, but I cannot save this data to Cassandra.
This is what I am trying to do, but just with the count result:
http://rustyrazorblade.com/2015/05/spark-streaming-with-python-and-kafka/
from __future__ import print_function
import sys
import os
import time
import urllib

import pyspark_cassandra
import pyspark_cassandra.streaming
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.functions import udf
from pyspark.sql.functions import from_unixtime, unix_timestamp, min, max
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: direct_kafka_wordcount.py <broker_list> <topic>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 1)
    sqlContext = SQLContext(sc)

    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])

    counts = lines.count()
    counts.saveToCassandra("spark", "count")
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
I got this error:
Traceback (most recent call last):
File "/tmp/direct_kafka_wordcount5.py", line 88, in
counts.saveToCassandra("spark", "count")
pyspark-cassandra stopped being updated a while ago, and the latest version only supports up to Spark 1.6:
https://github.com/TargetHolding/pyspark-cassandra
Additionally,
counts = lines.count()  # a new DStream with a single per-batch count, not the raw records
counts is a DStream of per-batch counts rather than keyed rows, so saveToCassandra has no row structure to map onto the columns of the spark.count table.
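A possible alternative, not from the original answer: with Spark 2.x, each micro-batch can be written through the DataFrame API of the DataStax Spark Cassandra Connector inside foreachRDD. This is only a sketch; the keyspace spark, the table count, its columns (batch_time, total), and the connector version are assumptions that must match your cluster.
# Submit with the connector instead of pyspark-cassandra, e.g. (version is an assumption):
#   spark-submit --packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.0 \
#       --conf spark.cassandra.connection.host=localhost /tmp/direct_kafka_wordcount5.py localhost:9092 testing
from pyspark.sql import SparkSession, Row

def save_counts(batch_time, rdd):
    # rdd holds the single per-batch count produced by lines.count()
    if rdd.isEmpty():
        return
    spark = SparkSession.builder.getOrCreate()
    rows = [Row(batch_time=str(batch_time), total=int(c)) for c in rdd.collect()]
    spark.createDataFrame(rows) \
        .write \
        .format("org.apache.spark.sql.cassandra") \
        .mode("append") \
        .options(keyspace="spark", table="count") \
        .save()

counts = lines.count()
counts.foreachRDD(save_counts)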

Share pyspark UDF across notebooks

I'm constantly reusing UDFs across IPython notebooks and am trying to figure out if there is some way to share the code.
I'd love to be able to make a file, let's call it sparktoolz.py:
import pyspark.sql.functions as F
import pyspark.sql.types as T

def myfunc(foo):
    # do stuff to foo
    return transformed_foo

myfunc_udf = F.udf(myfunc, T.SomeType())
Then from any given notebook in the same directory as sparktoolz.py do something like this:
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext()
sqlContext = SQLContext(sc)
sc.addPyFile('sparktoolz.py')
from sparktoolz import myfunc_udf
df = sqlContext.read.parquet('path/to/foo')
stuff = df.select(myfunc_udf(F.col('bar')))
Whenever I try something like this, the notebook can find sparktoolz.py but gives me an ImportError: cannot import name myfunc_udf.
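One pattern that often sidesteps this, offered as a sketch rather than a confirmed fix for the ImportError: keep only the plain Python function in sparktoolz.py and build the UDF after the SparkContext exists, so nothing in the module depends on Spark state at import time. get_myfunc_udf, the StringType return type, and the upper-casing body are illustrative assumptions.
# sparktoolz.py
import pyspark.sql.functions as F
import pyspark.sql.types as T

def myfunc(foo):
    # placeholder transformation; the real logic goes here
    return foo.upper() if foo else foo

def get_myfunc_udf():
    # Build the UDF lazily, once a SparkContext is available in the notebook.
    return F.udf(myfunc, T.StringType())

# In the notebook:
# sc.addPyFile('sparktoolz.py')
# from sparktoolz import get_myfunc_udf
# myfunc_udf = get_myfunc_udf()
# stuff = df.select(myfunc_udf(F.col('bar')))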
