I am trying to process the data returned when using the chunksize parameter, but I am getting an error because the iterator is not being converted into a DataFrame.
pdvs = dbu.readFromSQL("SELECT GOOGLE.* \
,creation_time \
FROM [dbo].[DM_SOURCE_GOOGLE_DETAILS] AS GOOGLE\
JOIN \
( \
SELECT place_id \
,max(datetime) AS datetime \
, min(datetime) AS creation_time \
FROM [dbo].[DM_SOURCE_GOOGLE_DETAILS] \
GROUP BY place_id \
)AS date_updated \
ON GOOGLE.datetime = date_updated.datetime \
AND GOOGLE.place_id = date_updated.place_id", chunksize = 5
)
combined = pd.DataFrame(pdvs)
# get address
combined['valid'] = True
But the transformation is not working; I am getting the following error:
'generator' object does not support item assignment
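Since read_sql with chunksize returns an iterator of DataFrame chunks rather than a single DataFrame, the chunks have to be materialized before any column assignment. A minimal sketch, assuming dbu.readFromSQL simply passes chunksize through to pandas.read_sql:
import pandas as pd

# Assumption: readFromSQL(..., chunksize=5) yields DataFrame chunks,
# the way pandas.read_sql does when chunksize is set.
chunks = dbu.readFromSQL("SELECT ...", chunksize=5)  # the full query from above

# Concatenate all chunks into one real DataFrame before assigning columns.
combined = pd.concat(chunks, ignore_index=True)
combined['valid'] = True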
I want to create a simple Python script to read some .pcd files and create a sensor_msgs::PointCloud2 for each in a rosbag.
I tried using the python-pcl library, but I'm probably doing something wrong when adding the points to the data field, because when I play the rosbag and check with RViz or echo the topic I get no points.
This is the part where I set up the PointCloud2 msg:
pcl_data = pcl.load(metadata_dir + "/" + pcd_path)
# get data
pcl_msg = sensor_msgs.msg.PointCloud2()
pcl_msg.data = np.ndarray.tobytes(pcl_data.to_array())
pcl_msg.header.stamp = rospy.Time(t_us/10000000.0)
pcl_msg.header.frame_id = "robot_1/navcam_sensor"
# Publish PointCloud2 msg
outbag.write("/robot_1/pcl_navcam", pcl_msg, rospy.Time(t_us/10000000.0))
I also tried pypcd, without any luck.
How would you do it? Maybe there is a toROSMsg method somewhere, like in the C++ version of PCL?
Is there a Python equivalent for what is very easily available in C++: pcl::toROSMsg?
Thank you
Here is the full code of the Python script:
#! /usr/bin/env python3
import rospy
import rosbag
import tf2_msgs.msg
import geometry_msgs.msg
import sensor_msgs.msg
import sys
import os
import json
import numpy as np
import tf.transformations as tf_transformations
import pcl
import json
import math
import pypcd
import sensor_msgs.point_cloud2 as pc2
import tf2_msgs.msg._TFMessage
def main():
    output_bag_path = dataset_path + "rosbag.bag"
    with rosbag.Bag(output_bag_path, 'w') as outbag:
        # iterate metadata files with tfs
        metadata_dir = dataset_path + "Pointcloud/metadata"
        t_first_flag = False
        # for filename in os.listdir(metadata_dir):
        list_of_files = sorted(filter(lambda x: os.path.isfile(os.path.join(metadata_dir, x)),
                                      os.listdir(metadata_dir)))
        for filename in list_of_files:
            # open json file
            json_path = os.path.join(metadata_dir, filename)
            json_file = open(json_path)
            json_data = json.load(json_file)
            # get timestamp
            t_us = json_data["metadata"]["Timestamps"]["microsec"]
            t_ns, t_s = math.modf(t_us / 1000000)
            # get camera tf
            pos = geometry_msgs.msg.Vector3(
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["translation"][0],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["translation"][1],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["translation"][2])
            quat = geometry_msgs.msg.Quaternion(
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["orientation"]["x"],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["orientation"]["y"],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["orientation"]["z"],
                json_data["metadata"]["pose_robotFrame_sensorFrame"]["data"]["orientation"]["w"])
            navcam_sensor_tf = geometry_msgs.msg.TransformStamped()
            navcam_sensor_tf.header.frame_id = "reu_1/base_link"
            navcam_sensor_tf.child_frame_id = "reu_1/navcam_sensor"
            navcam_sensor_tf.header.stamp = rospy.Time(t_us / 1000000.0)
            navcam_sensor_tf.transform.translation = pos
            navcam_sensor_tf.transform.rotation = quat
            # get base_link tf
            pos = geometry_msgs.msg.Vector3(
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["translation"][0],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["translation"][1],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["translation"][2])
            quat = geometry_msgs.msg.Quaternion(
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["orientation"]["x"],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["orientation"]["y"],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["orientation"]["z"],
                json_data["metadata"]["pose_fixedFrame_robotFrame"]["data"]["orientation"]["w"])
            base_link_tf = geometry_msgs.msg.TransformStamped()
            base_link_tf.header.frame_id = "map"
            base_link_tf.child_frame_id = "reu_1/base_link"
            base_link_tf.header.stamp = rospy.Time(t_us / 1000000.0)
            base_link_tf.transform.translation = pos
            base_link_tf.transform.rotation = quat
            # publish TFs
            tf_msg = tf2_msgs.msg.TFMessage()
            tf_msg.transforms = []
            tf_msg.transforms.append(base_link_tf)
            outbag.write("/tf", tf_msg, rospy.Time(t_us / 1000000.0))
            tf_msg = tf2_msgs.msg.TFMessage()
            tf_msg.transforms = []
            tf_msg.transforms.append(navcam_sensor_tf)
            outbag.write("/tf", tf_msg, rospy.Time(t_us / 1000000.0))
            # open corresponding .pcd file
            pcd_path = json_data["data"]["path"]
            pcl_data = pcl.load(metadata_dir + "/" + pcd_path)
            # pcl_data = pypcd.(metadata_dir + "/" + pcd_path)
            # get data
            pcl_msg = sensor_msgs.msg.PointCloud2()
            pcl_msg.data = np.ndarray.tobytes(pcl_data.to_array())
            pcl_msg.header.stamp = rospy.Time(t_us / 1000000.0)  # t_s, t_ns)
            pcl_msg.header.frame_id = "reu_1/navcam_sensor"
            # Publish PointCloud2 msg
            outbag.write("/reu_1/pcl_navcam", pcl_msg, rospy.Time(t_us / 1000000.0))


if __name__ == "__main__":
    dataset_path = "/home/---/Documents/datasets/---/"
    main()
The base_link and camera TFs come from a JSON file that also stores a string pointing to the associated .pcd file.
One issue with the code you posted is that it only creates one PointCloud2 message per file. That being said, there is already a package that does what you're hoping for: check out the pcl_ros module. You can create a PointCloud2 message and publish it with rosrun pcl_ros pcd_to_pointcloud <file.pcd> [ <interval> ].
Also of note: if you're running a full ROS desktop install, you don't actually need to install the PCL libraries individually; they're baked into the default ROS install.
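If you do want to keep building the bag in Python with python-pcl, note that filling only .data leaves fields, width, height, point_step and row_step at their defaults, so RViz has nothing to render. A minimal sketch (untested, assuming pcl_data holds XYZ points and that t_us and outbag come from the loop in the script above) that lets sensor_msgs.point_cloud2 assemble a complete message:
import rospy
import sensor_msgs.point_cloud2 as pc2
from std_msgs.msg import Header

# Assumption: pcl_data, t_us and outbag are the variables from the loop above.
header = Header()
header.stamp = rospy.Time(t_us / 1000000.0)
header.frame_id = "reu_1/navcam_sensor"

points = pcl_data.to_array()  # (N, 3) float32 array for a PointXYZ cloud
pcl_msg = pc2.create_cloud_xyz32(header, points)  # fills fields, width, height, steps
outbag.write("/reu_1/pcl_navcam", pcl_msg, header.stamp)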
My Spark settings are as follows:
spark_conf = SparkConf().setAppName('app_name') \
.setMaster("local[4]") \
.set('spark.executor.memory', "8g") \
.set('spark.executor.cores', 4) \
.set('spark.task.cpus', 1)
sc = SparkContext.getOrCreate(conf=spark_conf)
sc.setCheckpointDir(dirName='checkpoint')
When I do not have any checkpoint in the Spark chain and my program looks like this:
result = sc.parallelize(group, 4) \
.map(func_read, preservesPartitioning=True)\
.map(func2,preservesPartitioning=True) \
.flatMap(df_to_dic, preservesPartitioning=True) \
.reduceByKey(func3) \
.map(func4, preservesPartitioning=True) \
.reduceByKey(func5) \
.map(write_to_db) \
.count()
Running time is about 8 hours.
But when I use a checkpoint and cache the RDD like this:
result = sc.parallelize(group, 4) \
.map(func_read, preservesPartitioning=True)\
.map(func2,preservesPartitioning=True) \
.flatMap(df_to_dic, preservesPartitioning=True) \
.reduceByKey(func3) \
.map(func4, preservesPartitioning=True) \
.reduceByKey(func5) \
.map(write_to_db)
result.cache()
result.checkpoint()
result.count()
The program runs in about 3 hours. Could you please explain how it is possible that the program runs faster after caching the RDD and using a checkpoint?
Any help would be really appreciated.
I have designed a Python SQLite API which interfaces with a GUI. The GUI allows the user to select a given column whose data will be summed for each month. From what I have learned from https://docs.python.org/2/library/sqlite3.html I know that the way I've written this makes my code vulnerable to an SQL injection attack, since I've assembled my query using Python's string operations. However, I am unable to make this module work the "right" way: using the DB-API's parameter substitution to put a "?" placeholder wherever you want to use a value. I'm guessing the issue is that I want to make a table column the variable, not a value. Please help me restructure this module so that it is more secure and less vulnerable to SQL injection.
The code below works (it functions as I would like it to); I just know that it is not the correct or most secure way to do this.
def queryEntireCategoryAllEmployees(self, column):
    table_column = 'Name_Data_AllDaySums.%s' % column
    cursor = self.conn.execute("SELECT \
SUBSTR(data_date,1,7), \
SUM(%s) \
FROM ( \
SELECT \
SS_Installations.data_date AS 'data_date', \
SS_Installations.Installations_day_sum, \
SS_PM_Site_Visits.PM_Site_Visits_day_sum, \
SS_Rpr_Maint_Site_Visits.Inst_Repair_or_Maintenance_on_Site_day_sum, \
SS_Rmt_Hrdwr_Spt.Rmt_Hardware_Support_day_sum, \
SS_Rmt_Sftwr_Spt.Rmt_Software_Support_day_sum, \
SS_Rpr_Mant_RFB_in_House.Inst_Repair_Maint_Rfb_In_House_day_sum, \
Miscellaneous.Miscellaneous_day_sum, \
SS_Doc_Gen.Document_Generation_day_sum, \
SS_Inter_Dep_Spt.Inter_Dep_Spt_day_sum, \
SS_Online_Training.Online_Training_day_sum, \
SS_Onsite_Training.Onsite_Training_day_sum, \
SS_In_House_Training.In_House_Training_day_sum, \
Validation_Duties.Validation_Duties_day_sum \
FROM \
SS_Installations \
INNER JOIN SS_PM_Site_Visits ON \
SS_Installations.employee_clk_no = SS_PM_Site_Visits.employee_clk_no AND \
SS_Installations.data_date = SS_PM_Site_Visits.data_date \
INNER JOIN SS_Rpr_Maint_Site_Visits ON \
SS_Installations.employee_clk_no = SS_Rpr_Maint_Site_Visits.employee_clk_no AND \
SS_PM_Site_Visits.data_date = SS_Rpr_Maint_Site_Visits.data_date \
INNER JOIN SS_Rmt_Hrdwr_Spt ON \
SS_Installations.employee_clk_no = SS_Rmt_Hrdwr_Spt.employee_clk_no AND \
SS_Rpr_Maint_Site_Visits.data_date = SS_Rmt_Hrdwr_Spt.data_date \
INNER JOIN SS_Rmt_Sftwr_Spt ON \
SS_Installations.employee_clk_no = SS_Rmt_Sftwr_Spt.employee_clk_no AND \
SS_Rmt_Hrdwr_Spt.data_date = SS_Rmt_Sftwr_Spt.data_date \
INNER JOIN SS_Rpr_Mant_RFB_in_House ON \
SS_Installations.employee_clk_no = SS_Rpr_Mant_RFB_in_House.employee_clk_no AND \
SS_Rmt_Sftwr_Spt.data_date = SS_Rpr_Mant_RFB_in_House.data_date \
INNER JOIN Miscellaneous ON \
SS_Installations.employee_clk_no = Miscellaneous.employee_clk_no AND \
SS_Rpr_Mant_RFB_in_House.data_date = Miscellaneous.data_date \
INNER JOIN SS_Doc_Gen ON \
SS_Installations.employee_clk_no = SS_Doc_Gen.employee_clk_no AND \
Miscellaneous.data_date = SS_Doc_Gen.data_date \
INNER JOIN SS_Inter_Dep_Spt ON \
SS_Installations.employee_clk_no = SS_Inter_Dep_Spt.employee_clk_no AND \
SS_Doc_Gen.data_date = SS_Inter_Dep_Spt.data_date \
INNER JOIN SS_Online_Training ON \
SS_Installations.employee_clk_no = SS_Online_Training.employee_clk_no AND \
SS_Inter_Dep_Spt.data_date = SS_Online_Training.data_date \
INNER JOIN SS_Onsite_Training ON \
SS_Installations.employee_clk_no = SS_Onsite_Training.employee_clk_no AND \
SS_Online_Training.data_date = SS_Onsite_Training.data_date \
INNER JOIN SS_In_House_Training ON \
SS_Installations.employee_clk_no = SS_In_House_Training.employee_clk_no AND \
SS_Onsite_Training.data_date = SS_In_House_Training.data_date \
INNER JOIN Validation_Duties ON \
SS_Installations.employee_clk_no = Validation_Duties.employee_clk_no AND \
SS_In_House_Training.data_date = Validation_Duties.data_date \
WHERE \
(SS_Installations.Installations_day_sum != 0 OR \
SS_PM_Site_Visits.PM_Site_Visits_day_sum !=0 OR \
SS_Rpr_Maint_Site_Visits.Inst_Repair_or_Maintenance_on_Site_day_sum != 0 OR \
SS_Rmt_Hrdwr_Spt.Rmt_Hardware_Support_day_sum != 0 OR \
SS_Rmt_Sftwr_Spt.Rmt_Software_Support_day_sum != 0 OR \
SS_Rpr_Mant_RFB_in_House.Inst_Repair_Maint_Rfb_In_House_day_sum != 0 OR \
Miscellaneous.Miscellaneous_day_sum != 0 OR \
SS_Doc_Gen.Document_Generation_day_sum != 0 OR \
SS_Inter_Dep_Spt.Inter_Dep_Spt_day_sum != 0 OR \
SS_Online_Training.Online_Training_day_sum != 0 OR \
SS_Onsite_Training.Onsite_Training_day_sum != 0 OR \
SS_In_House_Training.In_House_Training_day_sum != 0 OR \
Validation_Duties.Validation_Duties_day_sum != 0)) Name_Data_AllDaySums \
GROUP BY SUBSTR(data_date,1,7) \
ORDER BY SUBSTR(data_date,1,7) ASC" % table_column)
    dataList = cursor.fetchall()
    return dataList
To start, I would read up on this incredibly informative SO post on preventing SQL injection in PHP, as many of the principles apply: How can I prevent SQL Injection in PHP?
Additionally, if you are working with SQL Server, I would consider creating a stored procedure and running it with the EXEC command in T-SQL, passing your column name as a parameter (since your query only changes dynamically based on the column); see this MSSQL Docs example, Execute a Stored Procedure, and this SO thread on dynamically changing a query based on a parameter: Can I Pass Column Name As Input...
Doing it this way will help you obscure your code from prying eyes and also secure it against injection attacks, as you will be able to validate that the input matches what you expect.
Finally, consider using a drop-down list of columns to choose from, so that the end user can only pick from a pre-defined set of inputs, making your application even more secure. This approach, as well as obscuring the code in a stored procedure, will also make it much easier to push out updates over time.
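For the sqlite3 case in the question, one way to enforce that pre-defined set is to whitelist the column names in Python and only then interpolate them, since ? placeholders can only stand in for values, never identifiers. A minimal sketch, assuming the *_day_sum columns from the query above and a hypothetical Name_Data_AllDaySums view standing in for the joined subquery:
import sqlite3

# Whitelist of the column names the GUI may request; anything else is rejected.
ALLOWED_COLUMNS = frozenset({
    "Installations_day_sum",
    "PM_Site_Visits_day_sum",
    "Document_Generation_day_sum",
    # ... the remaining *_day_sum columns used in the query above
})

def query_monthly_sums(conn, column):
    """Sum one whitelisted column per month (hypothetical Name_Data_AllDaySums view)."""
    if column not in ALLOWED_COLUMNS:
        raise ValueError("unexpected column name: %r" % column)
    # The identifier is now one of our own literals, so formatting it in is safe;
    # any user-supplied values would still go through '?' placeholders.
    query = ("SELECT SUBSTR(data_date, 1, 7) AS month, SUM(%s) "
             "FROM Name_Data_AllDaySums "
             "GROUP BY month ORDER BY month ASC" % column)
    return conn.execute(query).fetchall()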
I am currently using IBM Data Scientist Workbench with Jupyter notebooks and Spark.
I am trying to read several CSV files into a DataFrame and then apply some transformations to it in order to create a final DataFrame with merged data from the different CSV files, but for some reason I am getting this error:
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.Arrays.copyOf(Arrays.java:2367)
at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)
at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:114)
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:415)
at java.lang.StringBuilder.append(StringBuilder.java:132)
The code I am using is as follows:
i=0
count = 0
var_name = []
schema = StructType([])
df1 = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_ocurrences = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_count = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_merged = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_complete = sqlContext.createDataFrame(sc.emptyRDD(), schema)
FINAL = sqlContext.createDataFrame(sc.emptyRDD(), schema)
for file in os.listdir('/resources/data/test_variables/'):
    df1 = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("/resources/data/test_variables/" + file)
    # SKIP SERIES WITH ONLY 0s
    count = df1.groupBy().sum("Bit_value")
    if count.select("sum(Bit_value)").collect()[0][0] == 0:
        continue
    #
    i += 1
    # AGGREGATION
    df1 = df1.withColumn("Interval", ((df1.Timestamp.cast("long") / 1).cast("long") * 1).cast("timestamp"))
    # COUNT 1s
    df1_ocurrences = df1.groupBy("Interval").sum("Bit_value").sort("Interval")
    df1_ocurrences = df1_ocurrences.withColumnRenamed("sum(Bit_value)", "Sum_df1")
    # COUNT TOTAL
    df1_count = df1.groupBy("Interval").count().sort("Interval")
    df1_count = df1_count.withColumnRenamed("count", "Total_df1")
    # MERGING
    df1_merged = df1_ocurrences.join(df1_count, ["Interval"]).sort("Interval")
    var_name = file.split(".")
    df1_complete = df1_merged.withColumn(var_name[0], df1_merged.Sum_df1 / df1_merged.Total_df1)
    df1_complete = df1_complete.drop('Sum_df1')
    df1_complete = df1_complete.drop('Total_df1')
    # FINAL DATAFRAME
    if i == 1:
        FINAL = df1_complete
    else:
        FINAL = FINAL.join(df1_complete, ["Interval"]).sort("Interval")
Any advice on this? Maybe I am not writing the most efficient code, but I am new to Spark.
Too much time is being spent on GC and too little memory is being freed up; see https://developer.ibm.com/hadoop/2016/02/16/beginners-guide-apache-spark-troubleshooting/
In addition to the recommendations in the above article, what worked for me in Jupyter is this:
spark = SparkSession.builder \
.appName("GBT Model") \
.config("spark.executor.memory", "2000mb") \
.master("local[*]") \
.config("spark.executor.cores", "4") \
.config("spark.yarn.executor.memoryOverhead",200) \
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
.config("spark.default.parallelism", "4") \
.getOrCreate()
Note that spark.yarn.executor.memoryOverhead is set to 10% of the executor memory.
I have 5 item types that I have to parse thousands of files (approximately 20 KB - 75 KB each) for:
Item Types
SHA1 hashes
IP addresses
domain names
URLs (the full thing if possible)
email addresses
I currently use regex to find any items of this nature in thousands of files.
Python's regex is taking a really long time, and I was wondering if there is a better method to identify these item types anywhere in any of my text-based flat files.
reSHA1 = r"([A-F]|[0-9]|[a-f]){40}"
reIPv4 = r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|\[\.\])){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
reURL = r"[A-Z0-9\-\.\[\]]+(\.|\[\.\])(XN--CLCHC0EA0B2G2A9GCD|XN--HGBK6AJ7F53BBA|" \
r"XN--HLCJ6AYA9ESC7A|XN--11B5BS3A9AJ6G|XN--MGBERP4A5D4AR|XN--XKC2DL3A5EE0H|XN--80AKHBYKNJ4F|" \
r"XN--XKC2AL3HYE2A|XN--LGBBAT1AD8J|XN--MGBC0A9AZCG|XN--9T4B11YI5A|XN--MGBAAM7A8H|XN--MGBAYH7GPA|" \
r"XN--MGBBH1A71E|XN--FPCRJ9C3D|XN--FZC2C9E2C|XN--YFRO4I67O|XN--YGBI2AMMX|XN--3E0B707E|XN--JXALPDLP|" \
r"XN--KGBECHTV|XN--OGBPF8FL|XN--0ZWM56D|XN--45BRJ9C|XN--80AO21A|XN--DEBA0AD|XN--G6W251D|XN--GECRJ9C|" \
r"XN--H2BRJ9C|XN--J6W193G|XN--KPRW13D|XN--KPRY57D|XN--PGBS0DH|XN--S9BRJ9C|XN--90A3AC|XN--FIQS8S|" \
r"XN--FIQZ9S|XN--O3CW4H|XN--WGBH1C|XN--WGBL6A|XN--ZCKZAH|XN--P1AI|MUSEUM|TRAVEL|AERO|ARPA|ASIA|COOP|" \
r"INFO|JOBS|MOBI|NAME|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|XXX|AC|AD|AE|AF|AG|AI|AL|AM|AN|AO|AQ|" \
r"AR|AS|AT|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BJ|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|CA|CC|CD|CF|CG|CH|CI|CK|" \
r"CL|CM|CN|CO|CR|CU|CV|CW|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EE|EG|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE|" \
r"GF|GG|GH|GI|GL|GM|GN|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|IO|IQ|IR|IS|IT|JE|JM|JO|" \
r"JP|KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MG|MH|MK|ML|MM|MN|MO|" \
r"MP|MQ|MR|MS|MT|MU|MV|MW|MX|MY|MZ|NA|NC|NE|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|PA|PE|PF|PG|PH|PK|PL|PM|PN|PR|" \
r"PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SX|SY|SZ|TC|TD|TF|" \
r"TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TT|TV|TW|TZ|UA|UG|UK|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|YE|YT|ZA|ZM|ZW)" \
r"(/\S+)"
reDomain = r"[A-Z0-9\-\.\[\]]+(\.|\[\.\])(XN--CLCHC0EA0B2G2A9GCD|XN--HGBK6AJ7F53BBA|XN--HLCJ6AYA9ESC7A|" \
r"XN--11B5BS3A9AJ6G|XN--MGBERP4A5D4AR|XN--XKC2DL3A5EE0H|XN--80AKHBYKNJ4F|XN--XKC2AL3HYE2A|" \
r"XN--LGBBAT1AD8J|XN--MGBC0A9AZCG|XN--9T4B11YI5A|XN--MGBAAM7A8H|XN--MGBAYH7GPA|XN--MGBBH1A71E|" \
r"XN--FPCRJ9C3D|XN--FZC2C9E2C|XN--YFRO4I67O|XN--YGBI2AMMX|XN--3E0B707E|XN--JXALPDLP|XN--KGBECHTV|" \
r"XN--OGBPF8FL|XN--0ZWM56D|XN--45BRJ9C|XN--80AO21A|XN--DEBA0AD|XN--G6W251D|XN--GECRJ9C|XN--H2BRJ9C|" \
r"XN--J6W193G|XN--KPRW13D|XN--KPRY57D|XN--PGBS0DH|XN--S9BRJ9C|XN--90A3AC|XN--FIQS8S|XN--FIQZ9S|" \
r"XN--O3CW4H|XN--WGBH1C|XN--WGBL6A|XN--ZCKZAH|XN--P1AI|MUSEUM|TRAVEL|AERO|ARPA|ASIA|COOP|INFO|JOBS|" \
r"MOBI|NAME|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|XXX|AC|AD|AE|AF|AG|AI|AL|AM|AN|AO|AQ|AR|AS|AT" \
r"|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BJ|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|CA|CC|CD|CF|CG|CH|CI|CK|CL|CM|" \
r"CN|CO|CR|CU|CV|CW|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EE|EG|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE|GF|GG|" \
r"GH|GI|GL|GM|GN|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|IO|IQ|IR|IS|IT|JE|JM|JO|JP|" \
r"KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MG|MH|MK|ML|MM|MN|MO|MP" \
r"|MQ|MR|MS|MT|MU|MV|MW|MX|MY|MZ|NA|NC|NE|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|PA|PE|PF|PG|PH|PK|PL|PM|PN|PR|" \
r"PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SX|SY|SZ|TC|TD|TF" \
r"|TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TT|TV|TW|TZ|UA|UG|UK|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|YE|YT|ZA|" \
r"ZM|ZW)\b"
reEmail = r"\b[A-Za-z0-9._%+-]+(#|\[#\])[A-Za-z0-9.-]+(\.|\[\.\])(XN--CLCHC0EA0B2G2A9GCD|XN--HGBK6AJ7F53BBA|" \
r"XN--HLCJ6AYA9ESC7A|XN--11B5BS3A9AJ6G|XN--MGBERP4A5D4AR|XN--XKC2DL3A5EE0H|XN--80AKHBYKNJ4F|" \
r"XN--XKC2AL3HYE2A|XN--LGBBAT1AD8J|XN--MGBC0A9AZCG|XN--9T4B11YI5A|XN--MGBAAM7A8H|XN--MGBAYH7GPA|" \
r"XN--MGBBH1A71E|XN--FPCRJ9C3D|XN--FZC2C9E2C|XN--YFRO4I67O|XN--YGBI2AMMX|XN--3E0B707E|XN--JXALPDLP|" \
r"XN--KGBECHTV|XN--OGBPF8FL|XN--0ZWM56D|XN--45BRJ9C|XN--80AO21A|XN--DEBA0AD|XN--G6W251D|XN--GECRJ9C|" \
r"XN--H2BRJ9C|XN--J6W193G|XN--KPRW13D|XN--KPRY57D|XN--PGBS0DH|XN--S9BRJ9C|XN--90A3AC|XN--FIQS8S|" \
r"XN--FIQZ9S|XN--O3CW4H|XN--WGBH1C|XN--WGBL6A|XN--ZCKZAH|XN--P1AI|MUSEUM|TRAVEL|AERO|ARPA|ASIA|COOP|" \
r"INFO|JOBS|MOBI|NAME|BIZ|CAT|COM|EDU|GOV|INT|MIL|NET|ORG|PRO|TEL|XXX|AC|AD|AE|AF|AG|AI|AL|AM|AN|AO|AQ|" \
r"AR|AS|AT|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BJ|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|CA|CC|CD|CF|CG|CH|CI|CK" \
r"|CL|CM|CN|CO|CR|CU|CV|CW|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EE|EG|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE" \
r"|GF|GG|GH|GI|GL|GM|GN|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|IO|IQ|IR|IS|IT|JE|JM|" \
r"JO|JP|KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MG|MH|MK|ML|MM|MN" \
r"|MO|MP|MQ|MR|MS|MT|MU|MV|MW|MX|MY|MZ|NA|NC|NE|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|PA|PE|PF|PG|PH|PK|PL|PM|" \
r"PN|PR|PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SX|SY|SZ|TC" \
r"|TD|TF|TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TT|TV|TW|TZ|UA|UG|UK|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|YE|YT|" \
r"ZA|ZM|ZW)\b"
I am using a
with open(file, 'r') as f:
    text = f.read()  # read the whole file so the patterns can scan it
    for m in re.finditer(key, text, re.IGNORECASE):
        try:
            m = str(m).split('match=')[-1].split("'")[1]
            new_file.write(m + '\n')
        except:
            pass
method to open, find and output to a new file.
Any assistance with speeding this up and making it more efficient would be appreciated.
You probably want:
text = m.group(0)
print(text, file=new_file)
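Building on that, here is a sketch of the scan step with each pattern compiled once, outside the per-file loop (it reuses the reSHA1/reIPv4/reURL/reDomain/reEmail strings defined above; the scan_file helper and its output format are just illustrative):
import re

# Compile each pattern once and reuse it for every file; recompiling inside the
# loop is a common source of slowdown.
patterns = [re.compile(p, re.IGNORECASE)
            for p in (reSHA1, reIPv4, reURL, reDomain, reEmail)]

def scan_file(path, new_file):
    # Read the whole file (they are only ~20-75 KB) and run every pattern over it.
    with open(path, 'r') as f:
        text = f.read()
    for pattern in patterns:
        for m in pattern.finditer(text):
            # m.group(0) is the full matched text; no need to parse str(m).
            new_file.write(m.group(0) + '\n')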