I want to create a LabeledPoint from MongoDB using Python. I already tried to do this with a CSV file instead of MongoDB. Here is the code of the function that returns the LabeledPoint:
from numpy import array
from pyspark.mllib.regression import LabeledPoint

def createLabeledPoints(fields):
    q1 = int(fields[0])
    q2 = int(fields[1])
    result = int(fields[38])
    return LabeledPoint(result, array([q1, q2]))
This code works for me with the CSV file, and I get my collection from MongoDB as a pandas DataFrame using the code below:
from pymongo import MongoClient
from pandas import DataFrame

client = MongoClient('localhost', 27017)
db1 = client.newumc
collection1 = db1.data_classification
rawData1 = DataFrame(list(collection1.find({})))
and I get each field using the code below:
field_for_test = collection1.find({}, {'field_from_mongodb': 1, '_id': 0})
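For reference, here is a minimal sketch (the column names q1, q2, and result are assumptions, not the real MongoDB field names) of how rows of that pandas DataFrame could be turned into LabeledPoints, mirroring the CSV-based function above:

from numpy import array
from pyspark.mllib.regression import LabeledPoint

def row_to_labeled_point(row):
    # Hypothetical field names -- replace them with the real keys stored in the documents.
    features = array([int(row['q1']), int(row['q2'])])
    return LabeledPoint(int(row['result']), features)

labeled_points = [row_to_labeled_point(row) for _, row in rawData1.iterrows()]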
I solved the problem by using:
from pyspark.sql import SparkSession

spark = SparkSession \
.builder \
.appName("myApp") \
.config("spark.mongodb.input.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.config("spark.mongodb.output.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.getOrCreate()
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
field1 = df[1]
field2 = df[2]
Related
I have written this code using Python; when I run it, the following errors show up:
from pyspark.sql import SparkSession
from graphframes import GraphFrame

spark = SparkSession \
    .builder \
    .appName("GraphX") \
    .getOrCreate()

e = spark.read.parquet("hdfs://localhost:9000/gf/edge")
v = spark.read.parquet("hdfs://localhost:9000/gf/vertex")

s = GraphFrame(v, e)
s.edges.show()
s.vertices.show()
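For context, GraphFrames is distributed as a separate Spark package rather than being bundled with Spark, so the session usually needs the package on its classpath before GraphFrame(v, e) can work. A minimal sketch of one way to request it when building the session (the package coordinates are an assumption and must match your Spark and Scala versions):

from pyspark.sql import SparkSession

# Hypothetical coordinates -- pick the graphframes build matching your Spark/Scala versions.
spark = SparkSession \
    .builder \
    .appName("GraphX") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12") \
    .getOrCreate()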
The code below gives the following error:
import numpy as np
import pandas as pd
import json
from datetime import datetime
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, EnvironmentSettings, TableEnvironment
from pyflink.table.expressions import *
from pyflink.table.window import Tumble
from pyflink.table.types import DataTypes
from pyflink.table.udf import udf
# Create streaming environment
env = StreamExecutionEnvironment.get_execution_environment()
settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
# create table environment
tbl_env = StreamTableEnvironment.create(stream_execution_environment=env,environment_settings=settings)
kafka_jar = 'flink-sql-connector-kafka_2.11-1.13.0.jar'
tbl_env.get_config()\
.get_configuration()\
.set_string("pipeline.jars", "file://{}".format(kafka_jar))
@udf(result_type=DataTypes.ROW([
        DataTypes.FIELD("date", DataTypes.STRING()),
        DataTypes.FIELD("id", DataTypes.STRING()),
        DataTypes.FIELD("value", DataTypes.STRING()),
        DataTypes.FIELD("ts", DataTypes.STRING())
    ]),
    func_type='pandas')
def transform(df: pd.DataFrame) -> pd.DataFrame:
d = {}
d['foo1'] = 1
d['foo2'] = 2
df = pd.DataFrame(list(d.items()))
df['date'] = datetime.now().strftime('%Y-%m-%d')
df['ts'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
return df
#######################################################################
# Create Kafka Source Table with DDL
#######################################################################
topic, bootstrap_servers = 'test_201', 'localhost:9092'
src_ddl = """
CREATE TABLE source_tbl (
group VARCHAR,
id VARCHAR,
value VARCHAR,
t BIGINT,
proctime AS PROCTIME()
) WITH (
'connector' = 'kafka',
'topic' = '%s',
'properties.bootstrap.servers' = '%s',
'properties.group.id' = 'group',
'format' = 'json'
)
"""%(topic, bootstrap_servers)
tbl_env.execute_sql(src_ddl)
# create and initiate loading of source Table
tbl = tbl_env.from_path('source_tbl')
print('\nSource Schema')
tbl.print_schema()
tbl = tbl.filter(col('group') == 'group 1') \
    .window(Tumble.over(lit(5).seconds).on(col('proctime')).alias('w')) \
    .group_by(col('group'), col('id'), col('w')) \
    .select(col('id'), col('value').max.alias('value')) \
    .map(transform).execute().print()
Below are the questions I hope the community can help me with:
1. AssertionError: The result length '3' of Pandas UDF 'transform' is not equal to the input length '1'. I need to add extra records to the DataFrame, but I get this error. How can I overcome it? (See the sketch after these questions for the length constraint.)
2. How can I convert an unbounded table, like the one generated here from streaming data, into a bounded table? That would let me use df = tbl.to_pandas(), which currently fails. I am looking for a PyFlink equivalent of Spark's forEachBatch() method (https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#foreachbatch).
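For reference, the pandas flavour of map expects the returned DataFrame to have exactly as many rows as the batch it receives, which is what the assertion checks. A minimal sketch (column names are assumptions based on the schema above) of a UDF that satisfies the constraint by only adding columns, never rows:

@udf(result_type=DataTypes.ROW([
        DataTypes.FIELD("id", DataTypes.STRING()),
        DataTypes.FIELD("value", DataTypes.STRING()),
        DataTypes.FIELD("date", DataTypes.STRING()),
        DataTypes.FIELD("ts", DataTypes.STRING())
    ]),
    func_type='pandas')
def add_timestamps(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()  # keep every input row, so output length equals input length
    out['date'] = datetime.now().strftime('%Y-%m-%d')
    out['ts'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return out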
Thanks!
While using partitionBy() in PySpark, what approach should I follow to write CSV files into one single folder rather than multiple folders? Any suggested solution?
Code
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf
import pyodbc
appName = "PySpark Teradata Example"
master = "local"
conf = SparkConf() # create the configuration
conf.set("spark.repl.local.jars", "terajdbc4.jar")
conf.set("spark.executor.extraClassPath", "terajdbc4.jar")
conf.set("spark.driver.extraClassPath", "terajdbc4.jar")
spark = SparkSession.builder \
.config(conf=conf) \
.appName(appName) \
.master(master) \
.getOrCreate()
#input table name
table = "my_table_1"
df = spark.read \
.format('jdbc') \
.option('url', 'jdbc:teradata://xxx.xxx.xx.xx') \
.option('user', 'dbc') \
.option('password', 'dbc') \
.option('driver', 'com.teradata.jdbc.TeraDriver') \
.option('STRICT_NAMES', 'OFF') \
.option('query',"Select eno, CAST(edata.asJSONText() AS VARCHAR(32000)) as edata from AdventureWorksDW."+table)\
.load()
df.show()
df = df.withColumn("id_tmp", F.col(df.columns[0]) % 4).orderBy("id_tmp")
df.coalesce(4) \
    .write \
    .option("header", True) \
    .mode("overwrite") \
    .partitionBy("id_tmp") \
    .option("sep", "|") \
    .format("csv") \
    .save("C:\\Data\\" + table + "\\")
It gives multiple folders, each with multiple CSV files, as output. How can I write the output into a single folder? Also, how can I change the name of the file while writing it to the folder?
df = df.repartition(1) will reset the number of partitions to 1, but as Kafels mentioned, it is better to use coalesce:
df = df.coalesce(1)
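For example, here is a minimal sketch (the local output path and final file name are assumptions) that drops partitionBy(), coalesces to a single partition, and then renames the one part file Spark produces:

import glob, os, shutil

out_dir = "C:\\Data\\" + table + "\\"

df.coalesce(1) \
    .write \
    .option("header", True) \
    .option("sep", "|") \
    .mode("overwrite") \
    .format("csv") \
    .save(out_dir)

# Spark names its output files part-*; move the single part file to the name you want.
part_file = glob.glob(os.path.join(out_dir, "part-*.csv"))[0]
shutil.move(part_file, os.path.join(out_dir, table + ".csv"))

Spark still writes a directory (with a _SUCCESS marker), so the rename step is done after the write finishes.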
more info:
https://stackoverflow.com/a/31675351
https://stackoverflow.com/a/40983145
source:
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.repartition.html
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.coalesce.html
I am trying to read a file stored in Azure Data Lake Gen2 into a Spark DataFrame using Python. The code is:
from pyspark import SparkConf
from pyspark.sql import SparkSession
# create spark session
key = "some_key"
appName = "DataExtract"
master = "local[*]"
sparkConf = SparkConf() \
.setAppName(appName) \
.setMaster(master) \
.set("fs.azure.account.key.myaccount.dfs.core.windows.net", key)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
data_csv="abfs://test-file-system#myaccount.dfs.core.windows.net/data.csv"
data_out = "abfs://test-file-system#myaccount.dfs.core.windows.net/data_out.csv"
# read csv
df = spark.read.csv(data_csv)
# write csv
df.write.csv(data_out)
The file is read and written correctly, but I am getting the following error:
ERROR AzureBlobFileSystemStore: Failed to parse the date Thu, 09 Sep 2021 10:12:34 GMT
The date seems to be the file creation date. How can I parse the date to avoid getting this error?
I tried reproducing the same issue and found that these lines are causing the error:
data_csv = "abfs://test-file-system#myaccount.dfs.core.windows.net/data.csv"
data_out = "abfs://test-file-system#myaccount.dfs.core.windows.net/data_out.csv"
# read csv
df = spark.read.csv(data_csv)
Here is the code that worked for me when I replaced abfs with abfss in the lines above:
from pyspark import SparkConf
from pyspark.sql import SparkSession
# create spark session
key = "<Your Storage Account Key>"
appName = "<Synapse App Name>"
master = "local[*]"
sparkConf = SparkConf() \
.setAppName(appName) \
.setMaster(master) \
.set("fs.azure.account.key.<Storage Account Name>.dfs.core.windows.net", key)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
data_csv="abfss://<ContainerName>#<Storage Account Name>.dfs.core.windows.net/<Directory>"
# read csv
df1 = spark.read.option('header','true')\
.option('delimiter', ',')\
.csv(data_csv + '/sample1.csv')
df1.show()
# write csv
df1.write.csv(data_csv + '/<Give the name of blob you want to write to>.csv')
Alternatively, you can try the code below, which also worked perfectly for me:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
account_name = "<StorageAccount Name>"
container_name = "<Storage Account Container Name>"
relative_path = "<Directory path>"
adls_path = 'abfss://%s#%s.dfs.core.windows.net/%s'%(container_name,account_name,relative_path)
dataframe1 = spark.read.option('header','true')\
.option('delimiter', ',')\
.csv(adls_path + '/sample1.csv')
dataframe1.show()
dataframe1.write.csv(adls_path + '/<Give the name of blob you want to write to>.csv')
REFERENCE :
Synapse Spark – Reading CSV files from Azure Data Lake Storage Gen 2 with Synapse Spark using Python - SQL Stijn (sql-stijn.com)
I get this error when using this code:
from numpy import array
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

def createLabeledPoints(fields):
    q1 = fields[1]
    q2 = fields[12]
    q3 = fields[23]
    result = fields[40]
    return LabeledPoint(result, array([q1, q2, q3]))

spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/newumc.classification_data") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/newumc.classification_data") \
    .getOrCreate()

df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
dt = df.rdd.map(createLabeledPoints)

model111 = DecisionTree.trainClassifier(dt, numClasses=467,
    categoricalFeaturesInfo={0: 2, 1: 2, 2: 2}, impurity='gini', maxDepth=30, maxBins=32)
but when I want to save my model "model111" and use it with Flask:
import cPickle as pickle
pickle.dump(model111, open("rfc1.pkl", "wb"))
it gives this error:
TypeError: can't pickle lock objects
I am new to Python... is there any way to unlock my model so I can use pickle, or can someone please suggest any other solution?
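For reference, MLlib models keep a reference to the JVM and SparkContext (which hold locks), so they cannot be pickled directly. A minimal sketch (the save path is an assumption) using MLlib's own save/load API instead:

from pyspark.mllib.tree import DecisionTreeModel

sc = spark.sparkContext

# Persist the trained model with MLlib's built-in serialization instead of pickle.
model111.save(sc, "file:///tmp/decision_tree_model")

# Later (for example inside the Flask app), load it back with a live SparkContext.
loaded_model = DecisionTreeModel.load(sc, "file:///tmp/decision_tree_model")

Note that loading still requires a running SparkContext in the process that serves the Flask requests.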