can't pickle lock objects - python

I get this error when using this code:
def createLabeledPoints(fields):
    q1 = fields[1]
    q2 = fields[12]
    q3 = fields[23]
    result = fields[40]
    return LabeledPoint(result, array([q1, q2, q3]))
spark = SparkSession \
.builder \
.appName("myApp") \
.config("spark.mongodb.input.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.config("spark.mongodb.output.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.getOrCreate()
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
dt = df.rdd.map(createLabeledPoints)
model111 = DecisionTree.trainClassifier(dt, numClasses=467,
categoricalFeaturesInfo={0:2,1:2, 2:2}, impurity='gini', maxDepth=30, maxBins=32)
But when I want to save my model "model111" and use it with Flask:
import cPickle as pickle
pickle.dump(model111, open("rfc1.pkl","wb"))
it gives this error:
TypeError: can't pickle lock objects
I am new to Python... Is there any way to "unlock" my model so I can pickle it, or can someone please suggest a solution?
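This kind of pickle failure typically happens because an MLlib model holds a reference to the SparkContext, which contains thread locks. A minimal sketch of the usual workaround, assuming the pyspark.mllib DecisionTree API from the question (the save path and feature vector are illustrative): persist the model with its own save()/load() methods instead of pickle, then reload it where Flask needs it.
from pyspark.mllib.tree import DecisionTreeModel

# Save the trained model to a path instead of pickling it (path is illustrative).
model111.save(spark.sparkContext, "file:///tmp/dt_model")

# Later, e.g. inside the Flask app, with a SparkSession available:
loaded_model = DecisionTreeModel.load(spark.sparkContext, "file:///tmp/dt_model")
prediction = loaded_model.predict([1, 0, 1])  # illustrative feature vector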

Related

missing id from vertex dataframe in pyspark creating GraphFrame

I have written this code using Python; when I run it, the following errors show up.
spark = SparkSession\
.builder\
.appName("GraphX")\
.getOrCreate()
e = spark.read.parquet("hdfs://localhost:9000/gf/edge")
v = spark.read.parquet("hdfs://localhost:9000/gf/vertex")
s = GraphFrame(v, e)
s.edges.show()
s.vertices.show()
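If the failure is about a missing id column, note that GraphFrames expects the vertex DataFrame to contain an id column and the edge DataFrame to contain src and dst columns. A minimal sketch under that assumption (the original column names below are hypothetical):
from graphframes import GraphFrame

# GraphFrames requires these exact column names:
#   vertices: id        edges: src, dst
v = spark.read.parquet("hdfs://localhost:9000/gf/vertex") \
    .withColumnRenamed("vertex_id", "id")      # "vertex_id" is an assumed original name
e = spark.read.parquet("hdfs://localhost:9000/gf/edge") \
    .withColumnRenamed("from", "src") \
    .withColumnRenamed("to", "dst")            # "from"/"to" are assumed original names

s = GraphFrame(v, e)
s.edges.show()
s.vertices.show()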

AttributeError: 'DataFrameWriter' object has no attribute 'start'

I am trying to write code using Kafka, Python and Spark.
The problem statement is: read data from XML; the data consumed will be in binary format and has to be stored in a dataframe.
I am getting the error below:
Error:
File "C:/Users/HP/PycharmProjects/xml_streaming/ConS.py", line 55, in
.format("console")
AttributeError: 'DataFrameWriter' object has no attribute 'start'
Here is my code for reference:
#import *
# Set spark environments
#os.environ['PYSPARK_PYTHON'] = <PATH>
#os.environ['PYSPARK_DRIVER_PYTHON'] = <PATH>
spark = SparkSession\
.builder\
.master("local[1]")\
.appName("Consumer")\
.getOrCreate()
topic_Name = 'XML_File_Processing3'
consumer = kafka.KafkaConsumer(topic_Name, bootstrap_servers=['localhost:9092'], auto_offset_reset='latest')
kafka_df = spark\
.read \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("kafka.security.protocol", "SSL") \
.option("failOnDataLoss", "false") \
.option("subscribe", topic_Name) \
.load()
#.option("startingOffsets", "earliest") \
print("Loaded to DataFrame kafka_df")
kafka_df.printSchema()
new_df = kafka_df.selectExpr("CAST(value AS STRING)")
schema = ArrayType(StructType()\
.add("book_id", IntegerType())\
.add("author", StringType())\
.add("title", StringType())\
.add("genre",StringType())\
.add("price",IntegerType())\
.add("publish_date", IntegerType())\
.add("description", StringType()))
book_DF = new_df.select(from_json(col("value"), schema).alias("dataf")) #.('data')).select("data.*")
book_DF.printSchema()
#book_DF.select("dataf.author").show()
book_DF.write\
.format("console")\
.start()
I don't have a lot of experience with Kafka, but at the end you're calling start() on the result of book_DF.write.format("console"), which is a DataFrameWriter object. It does not have a start() method.
Do you want to write this as a stream? Then you'll probably need to use something like the writeStream method:
book_DF.writeStream \
.format("kafka") \
.start()
More info + examples can be found here.
If you simply want to print your dataframe to the console you should be able to use the show method for that. So in your case: book_DF.show()
The error is with PySpark: the DataFrameWriter doesn't have a .start(); use .save() instead.
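For completeness, a minimal sketch of the streaming variant, assuming the goal is to stream the topic's values to the console (the topic name and bootstrap servers are taken from the question; the checkpoint path is illustrative). The key point is to read with readStream so that writeStream/start() are available:
# Streaming read: readStream (not read) returns a streaming DataFrame,
# whose writer exposes writeStream/start().
kafka_stream_df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", topic_Name) \
    .load()

# Console sink for debugging; the checkpoint path is illustrative.
query = kafka_stream_df.selectExpr("CAST(value AS STRING)") \
    .writeStream \
    .format("console") \
    .option("checkpointLocation", "C:/tmp/kafka_console_chk") \
    .start()

query.awaitTermination()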

why does pyarrow not detect invalid_row_handler when passing params as open_csv(**args)

I'm trying to add an invalid_row_handler to pyarrow ParseOptions when working with Ray Data, but it just ignores it and throws an invalid row error.
Test code:
invalid_content = '''src_ip,dst_ip,src_port,dst_port,src_mac,dst_mac,protocol,timestamp,flow_duration,flow_byts_s,flow_pkts_s,fwd_pkts_s,bwd_pkts_s,tot_fwd_pkts,tot_bwd_pkts,totlen_fwd_pkts,totlen_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,bwd_pkt_len_max,bwd_pkt_len_min,bwd_pkt_len_mean,bwd_pkt_len_std,pkt_len_max,pkt_len_min,pkt_len_mean,pkt_len_std,pkt_len_var,fwd_header_len,bwd_header_len,fwd_seg_size_min,fwd_act_data_pkts,flow_iat_mean,flow_iat_max,flow_iat_min,flow_iat_std,fwd_iat_tot,fwd_iat_max,fwd_iat_min,fwd_iat_mean,fwd_iat_std,bwd_iat_tot,bwd_iat_max,bwd_iat_min,bwd_iat_mean,bwd_iat_std,fwd_psh_flags,bwd_psh_flags,fwd_urg_flags,bwd_urg_flags,fin_flag_cnt,syn_flag_cnt,rst_flag_cnt,psh_flag_cnt,ack_flag_cnt,urg_flag_cnt,ece_flag_cnt,down_up_ratio,pkt_size_avg,init_fwd_win_byts,init_bwd_win_byts,active_max,active_min,active_mean,active_std,idle_max,idle_min,idle_mean,idle_std,fwd_byts_b_avg,fwd_pkts_b_avg,bwd_byts_b_avg,bwd_pkts_b_avg,fwd_blk_rate_avg,bwd_blk_rate_avg,fwd_seg_size_avg,bwd_seg_size_avg,cwe_flag_count,subflow_fwd_pkts,subflow_bwd_pkts,subflow_fwd_byts,subflow_bwd_byts
16.12.5.10,192.168.66.180,443,58842,,,6,2022-08-21 13:13:16,121.0,24330578.512396693,16528.92561983471,16528.92561983471,0.0,2,0,2944,0,1472.0,1472.0,1472.0,0.0,0.0,0.0,0.0,0.0,1472,1472,1472.0,0.0,0.0,40,0,20,2,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0.0,1472.0,251,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1472.0,0.0,0,2,0,2944,0
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0
0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'''
source = 'test.csv'
with open(source, 'w') as f:
    f.write(invalid_content)
import pyarrow as pa
from pyarrow import csv, fs
parse_options = csv.ParseOptions(delimiter=",", invalid_row_handler=lambda i: 'skip')
fields = 'src_ip,dst_ip,src_port,dst_port,src_mac,dst_mac,' \
'protocol,timestamp,flow_duration,flow_byts_s,flow_pkts_s,' \
'fwd_pkts_s,bwd_pkts_s,tot_fwd_pkts,tot_bwd_pkts,totlen_fwd_pkts,' \
'totlen_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,' \
'fwd_pkt_len_std,bwd_pkt_len_max,bwd_pkt_len_min,bwd_pkt_len_mean,' \
'bwd_pkt_len_std,pkt_len_max,pkt_len_min,pkt_len_mean,pkt_len_std,' \
'pkt_len_var,fwd_header_len,bwd_header_len,fwd_seg_size_min,' \
'fwd_act_data_pkts,flow_iat_mean,flow_iat_max,flow_iat_min,flow_iat_std,' \
'fwd_iat_tot,fwd_iat_max,fwd_iat_min,fwd_iat_mean,fwd_iat_std,bwd_iat_tot,' \
'bwd_iat_max,bwd_iat_min,bwd_iat_mean,bwd_iat_std,fwd_psh_flags,bwd_psh_flags,' \
'fwd_urg_flags,bwd_urg_flags,fin_flag_cnt,syn_flag_cnt,rst_flag_cnt,psh_flag_cnt,' \
'ack_flag_cnt,urg_flag_cnt,ece_flag_cnt,down_up_ratio,pkt_size_avg,' \
'init_fwd_win_byts,init_bwd_win_byts,active_max,active_min,active_mean,active_std,' \
'idle_max,idle_min,idle_mean,idle_std,fwd_byts_b_avg,fwd_pkts_b_avg,bwd_byts_b_avg,' \
'bwd_pkts_b_avg,fwd_blk_rate_avg,bwd_blk_rate_avg,fwd_seg_size_avg,bwd_seg_size_avg,' \
'cwe_flag_count,subflow_fwd_pkts,subflow_bwd_pkts,subflow_fwd_byts,subflow_bwd_byts'\
.split(',')
# working well
f = fs.LocalFileSystem().open_input_stream(source)
schema = {i: pa.string() for i in fields}
convert_options = csv.ConvertOptions(column_types=schema)
read_options = csv.ReadOptions(use_threads=False)
reader = csv.open_csv(f, read_options=read_options, **{'parse_options': parse_options, 'convert_options': convert_options})
batch = reader.read_next_batch()
table = pa.Table.from_batches([batch], schema=None)
print(batch)
# end working well
# working fail with invalid row error
import ray
from ray.data.dataset_pipeline import DatasetPipeline
from ray.data.datasource import FastFileMetadataProvider
from ray.runtime_env import RuntimeEnv
import os
os.environ["RAY_DATASET_FORCE_LOCAL_METADATA"] = "1"
runtime_env = RuntimeEnv(pip={
"packages": ["tensorflow==2.7.0", "numpy==1.21.4", "six==1.16.0", "numba==0.56.0", "pyarrow==9.0.0"],
"pip_check": False,
"pip_version": "==22.2.2;python_version=='3.7.3'"})
ray.init(runtime_env=runtime_env)
data_files = [source]
pipe: DatasetPipeline = ray.data.read_csv(
data_files,
meta_provider=FastFileMetadataProvider(),
parse_options=parse_options,
convert_options=convert_options,
).window(blocks_per_window=500)
print(pipe.count())
# end working fail with invalid row error
After re-initializing ParseOptions by editing the Ray code at https://github.com/ray-project/ray/blob/e4ce38d001dbbe09cd21c497fedd03d692b2be3e/python/ray/data/datasource/csv_datasource.py#L34
From
reader = csv.open_csv(f, read_options=read_options, **reader_args)
To
parse_options = reader_args["parse_options"]
reader_args["parse_options"]=csv.ParseOptions(delimiter=parse_options.delimiter, invalid_row_handler=parse_options.invalid_row_handler)
reader = csv.open_csv(f, read_options=read_options, **reader_args)
Then it works fine.
Could anyone explain why csv.open_csv(..., **reader_args) in Ray doesn't work when parse_options is not re-initialized? And is there any way to bypass this without editing the Ray code?
As tracked in https://github.com/ray-project/ray/issues/28326, this is an issue on the pyarrow side with deserializing ParseOptions objects. We've filed a fix for Ray in https://github.com/ray-project/ray/pull/28327 which will be included in the 2.1 release. Once the fix is merged you can use the latest master wheels (the "nightly wheels") to run your job without issues.
We've also filed a fix on the pyarrow side in https://github.com/apache/arrow/pull/14061 and it's likely to be included in one of the next releases.
In the meantime, to use your current Ray version without any changes, you can create your own custom CSV datasource to work around the problem:
class CustomCSVDatasource(ray.data.datasource.CSVDatasource):
    def _read_stream(
        self, f: "pyarrow.NativeFile", path: str, **reader_args
    ):
        import pyarrow
        from pyarrow import csv

        read_options = reader_args.pop(
            "read_options", csv.ReadOptions(use_threads=False)
        )
        parse_options = reader_args.pop("parse_options", csv.ParseOptions())
        # Re-init invalid row handler: https://issues.apache.org/jira/browse/ARROW-17641
        if hasattr(parse_options, "invalid_row_handler"):
            parse_options.invalid_row_handler = parse_options.invalid_row_handler
        reader = csv.open_csv(
            f, read_options=read_options, parse_options=parse_options, **reader_args
        )
        schema = None
        while True:
            try:
                batch = reader.read_next_batch()
                table = pyarrow.Table.from_batches([batch], schema=schema)
                if schema is None:
                    schema = table.schema
                yield table
            except StopIteration:
                return
data_files = [source]
pipe: DatasetPipeline = ray.data.read_datasource(
CustomCSVDatasource(),
paths=data_files,
meta_provider=FastFileMetadataProvider(),
parse_options=parse_options,
convert_options=convert_options,
).window(blocks_per_window=500)

Write partitioned csv files to a single folder - PySpark

While using partitionBy() in PySpark, what approach should I follow to write CSV files into one single folder rather than multiple folders? Any suggested solution?
Code
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf
import pyodbc
appName = "PySpark Teradata Example"
master = "local"
conf = SparkConf() # create the configuration
conf.set("spark.repl.local.jars", "terajdbc4.jar")
conf.set("spark.executor.extraClassPath", "terajdbc4.jar")
conf.set("spark.driver.extraClassPath", "terajdbc4.jar")
spark = SparkSession.builder \
.config(conf=conf) \
.appName(appName) \
.master(master) \
.getOrCreate()
#input table name
table = "my_table_1"
df = spark.read \
.format('jdbc') \
.option('url', 'jdbc:teradata://xxx.xxx.xx.xx') \
.option('user', 'dbc') \
.option('password', 'dbc') \
.option('driver', 'com.teradata.jdbc.TeraDriver') \
.option('STRICT_NAMES', 'OFF') \
.option('query',"Select eno, CAST(edata.asJSONText() AS VARCHAR(32000)) as edata from AdventureWorksDW."+table)\
.load()
df.show()
df = df.withColumn("id_tmp", F.col(df.columns[0]) % 4).orderBy("id_tmp")
df.coalesce(4) \
.write \
.option("header",True) \
.mode("overwrite") \
.partitionBy("id_tmp") \
.option("sep","|")\
.format("csv") \
.save("C:\\Data\\"+table+"\\")
It is giving multiple folders with multiple CSV files as output. How can I get it into a single folder? Also, how can I change the name of the file while writing it to the folder?
df = df.repartition(1) will reset the number of partitions to 1, but as Kafels mentioned, it is better to use coalesce:
df = df.coalesce(1)
more info:
https://stackoverflow.com/a/31675351
https://stackoverflow.com/a/40983145
source:
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.repartition.html
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.coalesce.html
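Building on that, a minimal sketch of writing everything as a single CSV file into one folder and then renaming the part file (it reuses the table variable from the question; the output path and final file name are illustrative, and partitionBy() is dropped because it always creates one sub-folder per partition value):
import glob
import os
import shutil

out_dir = "C:\\Data\\" + table + "\\"  # illustrative output path

# Collapse to one partition so Spark writes exactly one part file.
df.coalesce(1) \
    .write \
    .option("header", True) \
    .option("sep", "|") \
    .mode("overwrite") \
    .format("csv") \
    .save(out_dir)

# Spark names the file part-00000-<uuid>.csv; rename it after the write.
part_file = glob.glob(os.path.join(out_dir, "part-*.csv"))[0]
shutil.move(part_file, os.path.join(out_dir, table + ".csv"))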

create labeledpoint from mongodb using python

I want to create a LabeledPoint from MongoDB using Python.
I already tried to do that with a CSV file instead of MongoDB.
Here is the code of the function that returns the LabeledPoint:
def createLabeledPoints(fields):
    q1 = int(fields[0])
    q2 = int(fields[1])
    result = int(fields[38])
    return LabeledPoint(result, array([q1, q2, q3]))
This code works for me with a CSV file,
and I get my collection from MongoDB as a pandas DataFrame using the code below:
from pymongo import MongoClient
from pandas import DataFrame

client = MongoClient('localhost', 27017)
db1 = client.newumc
collection1 = db1.data_classification
rawData1 = DataFrame(list(collection1.find({})))
and I get each field using the code below:
field_for_test = collection1.find({}, {'field_from_mongodb': 1, '_id': 0})
I solved the problem by using:
spark = SparkSession \
.builder \
.appName("myApp") \
.config("spark.mongodb.input.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.config("spark.mongodb.output.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.getOrCreate()
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
field1 = df[1]
field2 = df[2]
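For reference, a minimal sketch of turning the loaded DataFrame into LabeledPoints, mirroring the df.rdd.map(...) approach from the first question (the field indices are illustrative and must match the collection's schema):
from numpy import array
from pyspark.mllib.regression import LabeledPoint

def create_labeled_point(fields):
    # Field indices are illustrative; adapt them to the collection's schema.
    q1 = int(fields[0])
    q2 = int(fields[1])
    result = int(fields[38])
    return LabeledPoint(result, array([q1, q2]))

labeled_points = df.rdd.map(create_labeled_point)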
