PySpark writeStream JSON value type change - python

When PySpark writeStream writes in JSON format, the value it produces looks like this:
{"value":"{\"id\": 15, \"tarih\": \"xyz\", \"time\": \"23/01/2023 00:00:00\", \"temperature\": 31.99}"}
But I want it to look like this:
{"value":{"id": 15, "tarih": "xyz", "time":"23/01/2023 00:00:00", "temperature": 31.99}}
Is this possible?
import findspark
findspark.init()

import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1 pyspark-shell'

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions

appName = "PySpark Datapipline KAFKA-CASANNDRA-REDIS"
master = "local"

spark = SparkSession \
    .builder \
    .appName(appName) \
    .master(master) \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

lines = spark \
    .readStream \
    .option('multiLine', True) \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "deneme4") \
    .option("startingOffsets", "earliest") \
    .option("includeHeaders", "true") \
    .option("failOnDataLoss", False) \
    .load()

df = lines.selectExpr("CAST(value AS STRING)")

query = df \
    .writeStream \
    .format("json") \
    .option("path", "C:\\Users\\alper\\OneDrive\\Masaüstü\\fqndeneme\\fqn11") \
    .option("checkpointLocation", "C:\\Users\\alper\\OneDrive\\Masaüstü\\fqndeneme11\\") \
    .option("maxRecordsPerFile", 1) \
    .start()

query.awaitTermination()
Unfortunately, I cannot have a fixed schema; the incoming data will change from time to time. I need to access the values inside value somehow, please help.
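One possible direction (a minimal sketch, not from the original post, and requiring Spark 2.4+ for schema_of_json even though the snippet above pins the 2.3.1 Kafka package): infer a schema from one representative message and use from_json to turn the string column into a struct before writing, so the JSON sink emits nested JSON instead of an escaped string.

from pyspark.sql.functions import col, from_json, schema_of_json, lit

# Representative message taken from the example output above; it only drives
# schema inference, so it must be refreshed if the payload shape changes.
sample = '{"id": 15, "tarih": "xyz", "time": "23/01/2023 00:00:00", "temperature": 31.99}'
json_schema = schema_of_json(lit(sample))

# Parse the Kafka value (a JSON string) into a struct named "value" so the
# sink writes {"value": {...}} rather than {"value": "{\"...\"}"}.
df = lines.selectExpr("CAST(value AS STRING) AS value") \
    .select(from_json(col("value"), json_schema).alias("value"))

Because the schema comes from a single sample, fields that appear later in the stream will surface as nulls until the sample is refreshed, so this is a workaround rather than true schema-less parsing.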

Related

Reading Pointcloud from .pcd to ROS PointCloud2

I want to create a simple python script to read some .pcd files and create a sensor_msgs::PointCloud2 for each in a rosbag.
I tried using the python-pcl library, but I'm probably doing something wrong when adding the points to the data field, because when playing the rosbag and checking with RViz and echoing the topic I get no points.
This is the part where I set the PointCloud2 msg.
pcl_data = pcl.load(metadata_dir + "/" + pcd_path)
# get data
pcl_msg = sensor_msgs.msg.PointCloud2()
pcl_msg.data = np.ndarray.tobytes(pcl_data.to_array())
pcl_msg.header.stamp = rospy.Time(t_us/10000000.0)
pcl_msg.header.frame_id = "robot_1/navcam_sensor"
# Publish PointCloud2 msg
outbag.write("/robot_1/pcl_navcam", pcl_msg, rospy.Time(t_us/10000000.0))
I also tried pypcd, without any luck either.
How would you do it? Maybe there is a ToROSMsg method somewhere like in the cpp version of pcl?
Is there a python equivalent for what is very easily available in cpp: pcl::toROSMsg ?
Thank you
Here is the full code of the python script:
#! /usr/bin/env python3
import rospy
import rosbag
import tf2_msgs.msg
import geometry_msgs.msg
import sensor_msgs.msg
import sys
import os
import json
import numpy as np
import tf.transformations as tf_transformations
import pcl
import json
import math
import pypcd
import sensor_msgs.point_cloud2 as pc2
import tf2_msgs.msg._TFMessage
def main():
    output_bag_path = dataset_path + "rosbag.bag"
    with rosbag.Bag(output_bag_path, 'w') as outbag:
        # iterate metadata files with tfs
        metadata_dir = dataset_path + "Pointcloud/metadata"
        t_first_flag = False
        # for filename in os.listdir(metadata_dir):
        list_of_files = sorted(filter(lambda x: os.path.isfile(os.path.join(metadata_dir, x)),
                                      os.listdir(metadata_dir)))
        for filename in list_of_files:
            # open json file
            json_path = os.path.join(metadata_dir, filename)
            json_file = open(json_path)
            json_data = json.load(json_file)
            # get timestamp
            t_us = json_data["metadata"]["Timestamps"]["microsec"]
            t_ns, t_s = math.modf(t_us / 1000000)
            # get camera tf
            pos = geometry_msgs.msg.Vector3(
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["translation"][0],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["translation"][1],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["translation"][2])
            quat = geometry_msgs.msg.Quaternion(
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["orientation"]["x"],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["orientation"]["y"],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["orientation"]["z"],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["orientation"]["w"])
            navcam_sensor_tf = geometry_msgs.msg.TransformStamped()
            navcam_sensor_tf.header.frame_id = "reu_1/base_link"
            navcam_sensor_tf.child_frame_id = "reu_1/navcam_sensor"
            navcam_sensor_tf.header.stamp = rospy.Time(t_us / 1000000.0)
            navcam_sensor_tf.transform.translation = pos
            navcam_sensor_tf.transform.rotation = quat
            # get base_link tf
            pos = geometry_msgs.msg.Vector3(
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["translation"][0],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["translation"][1],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["translation"][2])
            quat = geometry_msgs.msg.Quaternion(
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["orientation"]["x"],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["orientation"]["y"],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["orientation"]["z"],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["orientation"]["w"])
            base_link_tf = geometry_msgs.msg.TransformStamped()
            base_link_tf.header.frame_id = "map"
            base_link_tf.child_frame_id = "reu_1/base_link"
            base_link_tf.header.stamp = rospy.Time(t_us / 1000000.0)
            base_link_tf.transform.translation = pos
            base_link_tf.transform.rotation = quat
            # publish TFs
            tf_msg = tf2_msgs.msg.TFMessage()
            tf_msg.transforms = []
            tf_msg.transforms.append(base_link_tf)
            outbag.write("/tf", tf_msg, rospy.Time(t_us / 1000000.0))
            tf_msg = tf2_msgs.msg.TFMessage()
            tf_msg.transforms = []
            tf_msg.transforms.append(navcam_sensor_tf)
            outbag.write("/tf", tf_msg, rospy.Time(t_us / 1000000.0))
            # open corresponding .pcd file
            pcd_path = json_data["data"]["path"]
            pcl_data = pcl.load(metadata_dir + "/" + pcd_path)
            # pcl_data = pypcd.(metadata_dir + "/" + pcd_path)
            # get data
            pcl_msg = sensor_msgs.msg.PointCloud2()
            pcl_msg.data = np.ndarray.tobytes(pcl_data.to_array())
            pcl_msg.header.stamp = rospy.Time(t_us / 1000000.0)  # t_s, t_ns)
            pcl_msg.header.frame_id = "reu_1/navcam_sensor"
            # Publish PointCloud2 msg
            outbag.write("/reu_1/pcl_navcam", pcl_msg, rospy.Time(t_us / 1000000.0))
        pass

if __name__ == "__main__":
    dataset_path = "/home/---/Documents/datasets/---/"
    main()
The base_link and camera tfs come from a json file that also stores a string pointing to the associated .pcd file.
One issue with the code you posted is that it only creates one PointCloud2 message per file. That being said, there is already a package that does what you're hoping for; check out this pcl_ros module. You can create a PointCloud2 message and publish it with rosrun pcl_ros pcd_to_pointcloud <file.pcd> [ <interval> ].
Also of note: if you're running a full ROS desktop install, you don't actually need to install the PCL libraries individually; they're baked into the default ROS install.
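If you would rather stay in Python instead of shelling out to pcl_ros, a minimal sketch (my addition, assuming the cloud is a plain XYZ float32 cloud and reusing the sensor_msgs.point_cloud2 module already imported as pc2 in the script) is to let create_cloud_xyz32 build the fields, width, height and point_step that a bare PointCloud2 with only .data set is missing; the helper name cloud_to_msg is hypothetical:

import rospy
import sensor_msgs.point_cloud2 as pc2
from std_msgs.msg import Header

def cloud_to_msg(pcl_data, stamp, frame_id):
    """Roughly the Python counterpart of pcl::toROSMsg for an XYZ cloud (sketch)."""
    header = Header()
    header.stamp = stamp
    header.frame_id = frame_id
    points = pcl_data.to_array()          # Nx3 float32 array from python-pcl
    return pc2.create_cloud_xyz32(header, points)

# usage inside the loop, replacing the manual PointCloud2 construction:
# pcl_msg = cloud_to_msg(pcl_data, rospy.Time(t_us / 1000000.0), "reu_1/navcam_sensor")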

After using checkpoint in PySpark the program runs faster, why?

My Spark setup is like this:
spark_conf = SparkConf().setAppName('app_name') \
    .setMaster("local[4]") \
    .set('spark.executor.memory', "8g") \
    .set('spark.executor.cores', 4) \
    .set('spark.task.cpus', 1)

sc = SparkContext.getOrCreate(conf=spark_conf)
sc.setCheckpointDir(dirName='checkpoint')
When I do not have any checkpoint in the Spark chain, my program looks like this:
result = sc.parallelize(group, 4) \
    .map(func_read, preservesPartitioning=True) \
    .map(func2, preservesPartitioning=True) \
    .flatMap(df_to_dic, preservesPartitioning=True) \
    .reduceByKey(func3) \
    .map(func4, preservesPartitioning=True) \
    .reduceByKey(func5) \
    .map(write_to_db) \
    .count()
Running time is about 8 hours.
But when I use checkpoint and cache the RDD like this:
result = sc.parallelize(group, 4) \
    .map(func_read, preservesPartitioning=True) \
    .map(func2, preservesPartitioning=True) \
    .flatMap(df_to_dic, preservesPartitioning=True) \
    .reduceByKey(func3) \
    .map(func4, preservesPartitioning=True) \
    .reduceByKey(func5) \
    .map(write_to_db)

result.cache()
result.checkpoint()
result.count()
The program runs in about 3 hours. Could you please explain how it is possible that, after caching the RDD and using a checkpoint, the program runs faster?
Any help would be really appreciated.

tensorflow deeplabv3+ train from scratch: train.sh code to reproduce 82.2% on VOC val

DATA: VOC 2012 augmented dataset: I am training on train_aug with 10582 annotations.
CODE: I am using the official deeplabv3+ code. I didn't change anything except the bash script.
Pretrained weights: xception_65_imagenet_coco from the official model zoo.
So, this is my train.sh:
python "${WORK_DIR}"/train.py \
--logtostderr \
--train_split="train_aug" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size="513,513" \
--train_batch_size=15 \
--training_number_of_steps=30000 \
--fine_tune_batch_norm=False \
--num_clones=5 \
--base_learning_rate=0.007 \
--tf_initial_checkpoint="${COCO_PRE}/x65-b2u1s2p-d48-2-3x256-sc-cr300k_init.ckpt" \
--train_logdir="${TRAIN_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}"\
--initialize_last_layer=False
Result: I think it should be 82.2% with my configuration, but I got 80% on eval OS=16 and 80.15% on eval OS=8 in 30k steps.
So my question is: how can I get 82.2%?
Edit 09.02.2019: -----------------
I have noticed that fine_tune_batch_norm=false.
And in train.py:
Set to True if one wants to fine-tune the batch norm parameters in
DeepLabv3.
So I decided to try fine_tune_batch_norm=true, because training from scratch needs to update the BN parameters.
Edit 09/07: --------------------
Still not working with:
python "${WORK_DIR}"/train.py \
--logtostderr \
--train_split="train_aug" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size="513,513" \
--train_batch_size=15 \
--training_number_of_steps=100000 \
--fine_tune_batch_norm=true \
--num_clones=5 \
--base_learning_rate=0.007 \
--tf_initial_checkpoint="${COCO_PRE}/x65-b2u1s2p-d48-2-3x256-sc-cr300k_init.ckpt" \
--train_logdir="${TRAIN_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}"\
--initialize_last_layer=False
This time the result is even worse.
I reproduced the result: I got 81.5%. I used 40000 steps for the first round; I think 30000 steps might be better.
Bash Code:
python "${WORK_DIR}"/train.py \
--logtostderr \
--train_split="train_aug" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size="513,513" \
--train_batch_size=24 \
--base_learning_rate=0.007 \
--training_number_of_steps=30000 \
--fine_tune_batch_norm=true \
--num_clones=8 \
--tf_initial_checkpoint="${COCO_PRE}/x65-b2u1s2p-d48-2-3x256-sc-cr300k_init.ckpt" \
--train_logdir="${TRAIN_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}"\
--initialize_last_layer=true
python "${WORK_DIR}"/train.py \
--logtostderr \
--train_split="train" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
--atrous_rates=18 \
--output_stride=16 \
--decoder_output_stride=4 \
--train_crop_size="513,513" \
--train_batch_size=24 \
--training_number_of_steps=60000 \
--fine_tune_batch_norm=false \
--num_clones=8 \
--base_learning_rate=0.01 \
--tf_initial_checkpoint="${COCO_PRE}/x65-b2u1s2p-d48-2-3x256-sc-cr300k_init.ckpt" \
--train_logdir="${TRAIN_LOGDIR}" \
--dataset_dir="${PASCAL_DATASET}"\
--initialize_last_layer=true
reference: github

How to process data in small chunks in pandas?

I am trying to process the data after using the chunksize parameter. I am getting an error because the iterator is not converting into a DataFrame.
pdvs = dbu.readFromSQL(
    "SELECT GOOGLE.* \
          , creation_time \
     FROM [dbo].[DM_SOURCE_GOOGLE_DETAILS] AS GOOGLE \
     JOIN ( \
          SELECT place_id \
               , max(datetime) AS datetime \
               , min(datetime) AS creation_time \
          FROM [dbo].[DM_SOURCE_GOOGLE_DETAILS] \
          GROUP BY place_id \
     ) AS date_updated \
     ON GOOGLE.datetime = date_updated.datetime \
     AND GOOGLE.place_id = date_updated.place_id",
    chunksize=5
)
combined = pd.DataFrame(pdvs)
# get address
combined['valid'] = True
But the transformation is not working. I am getting the following error:
'generator' object does not support item assignment
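For reference, a minimal sketch of the usual chunked pattern (my addition: it assumes dbu.readFromSQL, the poster's wrapper, forwards chunksize to pandas.read_sql and therefore returns an iterator of DataFrames; query is a placeholder for the SQL string above):

import pandas as pd

# "query" stands in for the SQL string above; with chunksize set, the call
# yields an iterator of DataFrames rather than a single DataFrame.
chunks = dbu.readFromSQL(query, chunksize=5)

processed = []
for chunk in chunks:              # each chunk is a regular DataFrame
    chunk['valid'] = True         # item assignment works here, unlike on the generator
    processed.append(chunk)

# If the full result fits in memory, stitch the chunks back together.
combined = pd.concat(processed, ignore_index=True)

The key point is that the generator itself never supports column assignment; you either transform each chunk inside the loop or concatenate first and transform the combined DataFrame.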

Fastest method to find all item types with python

I have 5 item types that I have to parse thousands of files (approximately 20kb - 75kb) for:
Item Types
SHA1 hashes
IP addresses
domain names
URLs (the full thing, if possible)
email addresses
I currently use regex to find any items of this nature in thousands of files.
Python regex is taking a really long time, and I was wondering if there is a better method to identify these item types anywhere in my text-based flat files.
reSHA1 = r"([A-F]|[0-9]|[a-f]){40}"
reIPv4 = r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|\[\.\])){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
reURL = r"[A-Z0-9\-\.\[\]]+(\.|\[\.\])(XN--CLCHC0EA0B2G2A9GCD|XN--HGBK6AJ7F53BBA|" \
r"XN--HLCJ6AYA9ESC7A|XN--11B5BS3A9AJ6G|XN--MGBERP4A5D4AR|XN--XKC2DL3A5EE0H|XN--80AKHBYKNJ4F|" \
r"XN--XKC2AL3HYE2A|XN--LGBBAT1AD8J|XN--MGBC0A9AZCG|XN--9T4B11YI5A|XN--MGBAAM7A8H|XN--MGBAYH7GPA|" \
r"XN--MGBBH1A71E|XN--FPCRJ9C3D|XN--FZC2C9E2C|XN--YFRO4I67O|XN--YGBI2AMMX|XN--3E0B707E|XN--JXALPDLP|" \
r"XN--KGBECHTV|XN--OGBPF8FL|XN--0ZWM56D|XN--45BRJ9C|XN--80AO21A|XN--DEBA0AD|XN--G6W251D|XN--GECRJ9C|" \
r"XN--H2BRJ9C|XN--J6W193G|XN--KPRW13D|XN--KPRY57D|XN--PGBS0DH|XN--S9BRJ9C|XN--90A3AC|XN--FIQS8S|" \
r"XN--FIQZ9S|XN--O3CW4H|XN--WGBH1C|XN--WGBL6A|XN--ZCKZAH|XN--P1AI|MUSEUM|TRAVEL|AERO|ARPA|ASIA|COOP|" \
r"INFO|JOBS|MOBI|NAME|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|XXX|AC|AD|AE|AF|AG|AI|AL|AM|AN|AO|AQ|" \
r"AR|AS|AT|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BJ|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|CA|CC|CD|CF|CG|CH|CI|CK|" \
r"CL|CM|CN|CO|CR|CU|CV|CW|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EE|EG|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE|" \
r"GF|GG|GH|GI|GL|GM|GN|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|IO|IQ|IR|IS|IT|JE|JM|JO|" \
r"JP|KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MG|MH|MK|ML|MM|MN|MO|" \
r"MP|MQ|MR|MS|MT|MU|MV|MW|MX|MY|MZ|NA|NC|NE|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|PA|PE|PF|PG|PH|PK|PL|PM|PN|PR|" \
r"PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SX|SY|SZ|TC|TD|TF|" \
r"TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TT|TV|TW|TZ|UA|UG|UK|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|YE|YT|ZA|ZM|ZW)" \
r"(/\S+)"
reDomain = r"[A-Z0-9\-\.\[\]]+(\.|\[\.\])(XN--CLCHC0EA0B2G2A9GCD|XN--HGBK6AJ7F53BBA|XN--HLCJ6AYA9ESC7A|" \
r"XN--11B5BS3A9AJ6G|XN--MGBERP4A5D4AR|XN--XKC2DL3A5EE0H|XN--80AKHBYKNJ4F|XN--XKC2AL3HYE2A|" \
r"XN--LGBBAT1AD8J|XN--MGBC0A9AZCG|XN--9T4B11YI5A|XN--MGBAAM7A8H|XN--MGBAYH7GPA|XN--MGBBH1A71E|" \
r"XN--FPCRJ9C3D|XN--FZC2C9E2C|XN--YFRO4I67O|XN--YGBI2AMMX|XN--3E0B707E|XN--JXALPDLP|XN--KGBECHTV|" \
r"XN--OGBPF8FL|XN--0ZWM56D|XN--45BRJ9C|XN--80AO21A|XN--DEBA0AD|XN--G6W251D|XN--GECRJ9C|XN--H2BRJ9C|" \
r"XN--J6W193G|XN--KPRW13D|XN--KPRY57D|XN--PGBS0DH|XN--S9BRJ9C|XN--90A3AC|XN--FIQS8S|XN--FIQZ9S|" \
r"XN--O3CW4H|XN--WGBH1C|XN--WGBL6A|XN--ZCKZAH|XN--P1AI|MUSEUM|TRAVEL|AERO|ARPA|ASIA|COOP|INFO|JOBS|" \
r"MOBI|NAME|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|XXX|AC|AD|AE|AF|AG|AI|AL|AM|AN|AO|AQ|AR|AS|AT" \
r"|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BJ|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|CA|CC|CD|CF|CG|CH|CI|CK|CL|CM|" \
r"CN|CO|CR|CU|CV|CW|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EE|EG|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE|GF|GG|" \
r"GH|GI|GL|GM|GN|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|IO|IQ|IR|IS|IT|JE|JM|JO|JP|" \
r"KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MG|MH|MK|ML|MM|MN|MO|MP" \
r"|MQ|MR|MS|MT|MU|MV|MW|MX|MY|MZ|NA|NC|NE|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|PA|PE|PF|PG|PH|PK|PL|PM|PN|PR|" \
r"PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SX|SY|SZ|TC|TD|TF" \
r"|TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TT|TV|TW|TZ|UA|UG|UK|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|YE|YT|ZA|" \
r"ZM|ZW)\b"
reEmail = r"\b[A-Za-z0-9._%+-]+(#|\[#\])[A-Za-z0-9.-]+(\.|\[\.\])(XN--CLCHC0EA0B2G2A9GCD|XN--HGBK6AJ7F53BBA|" \
r"XN--HLCJ6AYA9ESC7A|XN--11B5BS3A9AJ6G|XN--MGBERP4A5D4AR|XN--XKC2DL3A5EE0H|XN--80AKHBYKNJ4F|" \
r"XN--XKC2AL3HYE2A|XN--LGBBAT1AD8J|XN--MGBC0A9AZCG|XN--9T4B11YI5A|XN--MGBAAM7A8H|XN--MGBAYH7GPA|" \
r"XN--MGBBH1A71E|XN--FPCRJ9C3D|XN--FZC2C9E2C|XN--YFRO4I67O|XN--YGBI2AMMX|XN--3E0B707E|XN--JXALPDLP|" \
r"XN--KGBECHTV|XN--OGBPF8FL|XN--0ZWM56D|XN--45BRJ9C|XN--80AO21A|XN--DEBA0AD|XN--G6W251D|XN--GECRJ9C|" \
r"XN--H2BRJ9C|XN--J6W193G|XN--KPRW13D|XN--KPRY57D|XN--PGBS0DH|XN--S9BRJ9C|XN--90A3AC|XN--FIQS8S|" \
r"XN--FIQZ9S|XN--O3CW4H|XN--WGBH1C|XN--WGBL6A|XN--ZCKZAH|XN--P1AI|MUSEUM|TRAVEL|AERO|ARPA|ASIA|COOP|" \
r"INFO|JOBS|MOBI|NAME|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|XXX|AC|AD|AE|AF|AG|AI|AL|AM|AN|AO|AQ|" \
r"AR|AS|AT|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BJ|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|CA|CC|CD|CF|CG|CH|CI|CK" \
r"|CL|CM|CN|CO|CR|CU|CV|CW|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EE|EG|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE" \
r"|GF|GG|GH|GI|GL|GM|GN|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|IO|IQ|IR|IS|IT|JE|JM|" \
r"JO|JP|KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MG|MH|MK|ML|MM|MN" \
r"|MO|MP|MQ|MR|MS|MT|MU|MV|MW|MX|MY|MZ|NA|NC|NE|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|PA|PE|PF|PG|PH|PK|PL|PM|" \
r"PN|PR|PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SX|SY|SZ|TC" \
r"|TD|TF|TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TT|TV|TW|TZ|UA|UG|UK|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|YE|YT|" \
r"ZA|ZM|ZW)\b"
I am using a
with open(file, 'r') as f:
    for m in re.finditer(key, text, re.IGNORECASE):
        try:
            m = str(m).split('match=')[-1].split("'")[1]
            new_file.write(m + '\n')
        except:
            pass
method to open, find and output to a new file.
Any assistance with speeding this up and making it more efficient would be appreciated.
You probably want:
text = m.group(0)
print(text, file=new_file)
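Building on that, a minimal sketch of the whole loop (my addition, assuming new_file is an already-open output file and the five patterns above are in scope; precompiling the patterns and reading each file once are additions, not part of the original post):

import re

# Compile once, outside the per-file loop; re.IGNORECASE matches the original usage.
patterns = [re.compile(p, re.IGNORECASE) for p in (reSHA1, reIPv4, reURL, reDomain, reEmail)]

def extract_items(path, out_file):
    """Scan one file with every pattern and write each raw match to out_file."""
    with open(path, 'r') as f:
        text = f.read()                        # read the file once, reuse for all patterns
    for pattern in patterns:
        for m in pattern.finditer(text):
            out_file.write(m.group(0) + '\n')  # the whole match, no str(m) parsing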
