How to improve execution time of importing data in Python

The code below takes 2.5 seconds to import a log file with 1 million lines.
Is there a better way to write the code that also decreases the execution time?
""" This code is used to read the log file into the memory and convert into the data frame
Once the log file is loaded ,every item in the IPQuery file checked if exist and result is print onto the console"""
#importing python modules required for this script to perform operations
import pandas as pd
import time
import sys
#code to check the arguments passed """
if len(sys.argv)!= 3:
raise ValueError(""" PLEASE PASS THE BOTH LOG FILE AND IPQUERY FILE AS INPUT TO SCRIPT
ex: python program.py log_file query_file """)
# extracting file names from command line """
log_file_name=sys.argv[1]
query_file_name = sys.argv[2]
start = time.time()#capturing time instance
#Reading the content from the log file into dataframe log_df """
log_df = pd.read_csv(log_file_name," ",header=None ,names = ['DATE','TIME', 'IPADDR','URL','STATUS'],skip_blank_lines = True)
#Reading the content from the IPquery file into the data frame query_df """
query_df = pd.read_csv(query_file_name," ",header=None,skip_blank_lines=True )
#Cheking if the IP address exists in the log file"""
Ipfound = query_df.isin(log_df.IPADDR).astype(int)
#print all the results to the Query results onto the stdout"""
for items in Ipfound[0]:
print items
print "Execution Time of this script is %f" %(time.time() - start)

A second version of the same task, this time using plain Python sets instead of pandas:

# import the Python modules required for this script
import time
import sys

start = time.time()  # capture the start time

class IpQuery:
    """The methods below read the file paths, import the log and query data
    and print the result to the console."""

    def __init__(self):
        self.log_file_name = ""
        self.query_file_name = ""
        self.logset = set()
        self.IPlist = []

    def Inputfiles(self):
        """Check the arguments passed and raise an error if they are missing."""
        if len(sys.argv) != 3:
            raise ValueError(""" PLEASE PASS BOTH THE LOG FILE AND THE IP QUERY FILE AS INPUT TO THE SCRIPT
            ex: python program.py log_file query_file """)
        # extract the file names from the command line
        self.log_file_name = sys.argv[1]
        self.query_file_name = sys.argv[2]

    def read_logfile(self):
        # read the IP addresses from the log file into a set
        with open(self.log_file_name, 'r') as f:
            self.logset = {line.split(' ')[2] for line in f if not line.isspace()}

    def read_Queryfile(self):
        # read the query file into a list
        with open(self.query_file_name, 'r') as f:
            self.IPlist = [line.rstrip('\n') for line in f if not line.isspace()]

    def CheckIpAdress(self):
        # IP addresses from the query file are checked against the log file
        dummy = self.logset.intersection(set(self.IPlist))
        for element in self.IPlist:
            if element in dummy:
                print "1"
            else:
                print "0"

try:
    # create an instance of IpQuery
    msd = IpQuery()
    # extract the input file information
    msd.Inputfiles()
    # import the IP information from the log file
    msd.read_logfile()
    # import the IP query information from the query file
    msd.read_Queryfile()
    # search for each query IP in the log file
    msd.CheckIpAdress()
except IOError:
    print "Error: can't find file or read data"
except ValueError:
    print "PLEASE PASS BOTH THE LOG FILE AND THE IP QUERY FILE AS INPUT TO THE SCRIPT"

Related

Script to convert multiple URLs or files to individual PDFs and save to a specific location

I have written a script in which the URLs and their filenames are hardcoded. Instead, I want to take the URLs from a saved text file and save the downloads automatically, in chronological order, to a specific folder.
My code (works):
import requests

# input urls and filenames
urls = ['https://www.northwestknowledge.net/metdata/data/pr_1979.nc',
        'https://www.northwestknowledge.net/metdata/data/pr_1980.nc',
        'https://www.northwestknowledge.net/metdata/data/pr_1981.nc']
fns = [r'C:\Users\HBI8\Downloads\pr_1979.nc',
       r'C:\Users\HBI8\Downloads\pr_1980.nc',
       r'C:\Users\HBI8\Downloads\pr_1981.nc']

# defining the inputs
inputs = zip(urls, fns)

# define download function
def download_url(args):
    url, fn = args[0], args[1]
    try:
        r = requests.get(url)
        with open(fn, 'wb') as f:
            f.write(r.content)
    except Exception as e:
        print('Failed:', e)

# loop through all inputs and run download function
for i in inputs:
    result = download_url(i)
Trying to fetch the links from the text file (error in code):
import requests

# getting all URLs from the text file
file = open('C:\\Users\\HBI8\\Downloads\\testing.txt', 'r')
#for each_url in enumerate(f):
list_of_urls = [(line.strip()).split() for line in file]
file.close()

# input urls and filenames
urls = list_of_urls
fns = [r'C:\Users\HBI8\Downloads\pr_1979.nc',
       r'C:\Users\HBI8\Downloads\pr_1980.nc',
       r'C:\Users\HBI8\Downloads\pr_1981.nc']

# defining the inputs
inputs = zip(urls, fns)

# define download function
def download_url(args):
    url, fn = args[0], args[1]
    try:
        r = requests.get(url)
        with open(fn, 'wb') as f:
            f.write(r.content)
    except Exception as e:
        print('Failed:', e)

# loop through all inputs and run download function
for i in inputs:
    result = download_url(i)
testing.txt has those 3 links pasted in it, one per line.
Error :
Failed: No connection adapters were found for "['https://www.northwestknowledge.net/metdata/data/pr_1979.nc']"
Failed: No connection adapters were found for "['https://www.northwestknowledge.net/metdata/data/pr_1980.nc']"
Failed: No connection adapters were found for "['https://www.northwestknowledge.net/metdata/data/pr_1981.nc']"
PS:
I am new to Python, and it would be helpful if someone could advise me on how to loop through the URLs from a text file and save the downloads individually in chronological order, as opposed to hardcoding the names (as I have done).
When you do list_of_urls = [(line.strip()).split() for line in file], you produce a list of lists: for each line of the file, you produce the list of URLs on that line, and then you make a list of these lists.
What you want is a flat list of URLs.
You could do
list_of_urls = [url for line in file for url in (line.strip()).split()]
Or:
list_of_urls = []
for line in file:
    list_of_urls.extend((line.strip()).split())
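With either form, each element of list_of_urls is now a plain string, so zip(urls, fns) hands requests.get a URL rather than a one-element list. A quick check, assuming testing.txt holds the three links from the question:

# sanity check of the flattened list (assumes testing.txt as in the question)
with open(r'C:\Users\HBI8\Downloads\testing.txt') as file:
    list_of_urls = [url for line in file for url in line.strip().split()]

print(list_of_urls[0])        # 'https://www.northwestknowledge.net/metdata/data/pr_1979.nc'
print(type(list_of_urls[0]))  # <class 'str'>, not a list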
By far the simplest method in this simple case is to use the OS command line:
go to the working directory C:\Users\HBI8\Downloads
invoke cmd (you can simply type that in the address bar)
write/paste your list into testing.txt using notepad testing.txt (if you don't already have it there)
Note that NC HDF files are NOT .pdf:
https://www.northwestknowledge.net/metdata/data/pr_1979.nc
https://www.northwestknowledge.net/metdata/data/pr_1980.nc
https://www.northwestknowledge.net/metdata/data/pr_1981.nc
then run
for /F %i in (testing.txt) do curl -O %i
92 seconds later, the three files have been downloaded.
I used ',' as a delimiter with the split function.
To generate the file name automatically, I used the index number of the stored list.
The data is saved in the text file in the following manner:
FileName | Object ID | Base URL
url_file = open('C:\\Users\\HBI8\\Downloads\\testing.txt', 'r')
fns = []
list_of_urls = []
for line in url_file:
    stripped_line = line.split(',')
    print(stripped_line)
    list_of_urls.append(stripped_line[2] + stripped_line[1])
    fns.append(stripped_line[0])
url_file.close()
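If the goal is simply to stop hardcoding the target names, another option (a sketch only, assuming testing.txt holds one URL per line as in the original question) is to derive each local filename from the last path component of the URL:

import os
import requests

# read one URL per line from the text file (path taken from the question)
with open(r'C:\Users\HBI8\Downloads\testing.txt') as f:
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    # derive the local filename from the URL, e.g. pr_1979.nc
    fn = os.path.join(r'C:\Users\HBI8\Downloads', os.path.basename(url))
    try:
        r = requests.get(url)
        with open(fn, 'wb') as out:
            out.write(r.content)
    except Exception as e:
        print('Failed:', e)

Because the URLs are read in file order, the downloads are saved in the same chronological order as the lines in testing.txt.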

How can I wait for text file to be written, read changes line by line and print?

I have a program which opens a .txt file in append mode, generates a random string and appends that string to the file at timed intervals.
Program: Random_Generation
import os
import sys
import string
import random
import time

class RandomCreation():
    def __init__(self, path):
        self.path = path

    def string_generator(self, size):
        chars = string.ascii_uppercase + string.ascii_lowercase
        return ''.join(random.choice(chars) for _ in range(size))

if __name__ == '__main__':
    path = os.path.realpath(sys.argv[1])
    try:
        while True:
            random_cration = RandomCreation(path)
            data = random_cration.string_generator(10)
            f = open(f'{path}/try_file.txt', 'a+')
            f.write(data + "\n")
            print('Successfully Appended')
            time.sleep(2)
    except KeyboardInterrupt:
        f.close()
        print('Exiting......')
        exit()
This program runs perfectly.
I want to write another program (called File_Handling) which opens the same .txt file and reads it line by line. This program depends on the above-mentioned Random_Generation program. File_Handling should wait until the text file has been created. Once the file exists, it should read it line by line and print the content while the file is still being written by Random_Generation. If there is no new line to read, it should wait for one rather than stopping, and when new lines become available it should print only the new lines.
Program: File_Handling
import sys
import os
import os.path
import time

path = os.path.realpath(sys.argv[1])
filename = f'{path}/try_file.txt'

while not os.path.exists(filename):
    time.sleep(1)

if os.path.exists(filename):
    file = open(filename, "r")
    for num, line in enumerate(file, 1):
        print(f'{num}: {line}')
    file.close()
Both programs run in parallel. The requirements for File_Handling are:
If the text file is empty, it waits for a new line to become available.
If there are already lines in the text file, it reads them line by line until the end of the file.
After reaching the end of the file, it waits for the next new line.
It never goes back to lines that were read before.
The writing and reading programs are not synchronized.
A write can happen at any point in time.
The only guarantee is that after a write, reading proceeds up to the end of the file.
To reach the goal, does it necessarily need a semaphore, or maybe async, and should I incorporate them?
I am not able to get the required behaviour in the File_Handling program. Please help!
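A minimal sketch of one common way to get this tail-like behaviour, assuming the writer flushes each completed line: remember the file position before every read, print only complete new lines, and sleep briefly when nothing new has arrived. No semaphore is needed for this loose coupling, since the reader never blocks the writer.

import os
import sys
import time

path = os.path.realpath(sys.argv[1])
filename = f'{path}/try_file.txt'

# wait until the writer has created the file
while not os.path.exists(filename):
    time.sleep(1)

with open(filename, 'r') as f:
    num = 0
    while True:
        pos = f.tell()           # remember where this read attempt started
        line = f.readline()
        if line.endswith('\n'):  # a complete new line is available
            num += 1
            print(f'{num}: {line}', end='')
        else:
            # nothing new, or only a partially written line: rewind and retry
            f.seek(pos)
            time.sleep(0.5)

Note that the reader can only see lines once the writer actually flushes them; adding f.flush() after each f.write() in Random_Generation (or closing the file after every append) makes new lines appear promptly.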

Iterating Over List of Parsed Files Python

This program scans through a log file and finds faults and the timestamps for those faults. The problem I am having is finding a way to modify my program so that it can iterate over multiple files given via the command line with a wildcard. In its current state the code accepts a single file and builds the dictionary with my desired info successfully. I have been struggling to find a way to do this with multiple files simultaneously. The goal is to be able to enter the filename with a wildcard on the command line after the executable, for example -f filename.*txt, and parse all the files it matches. I have been successful in parsing multiple files, and proved it by printing out the list of files parsed, but I cannot find a way to run multiple files through my fault finder and build the dictionary. I would like the program to give the same result as it does when parsing a single file.
import sys
import argparse

_TIME_STAMP_LENGTH = 16
_FAULT_STRING_HEADER_LENGTH = 15

class FaultList():
    fault_dict = {}
    fault_dict_counter = {}

    def __init__(self, file):
        self.file = file
        self.find_faults()
        print self.fault_dict

    def find_faults(self):
        with open(self.file) as f:
            for line in f.readlines():
                fault_index = line.find("Fault Cache id")
                if(fault_index != -1):
                    time_stamp = line[:_TIME_STAMP_LENGTH]
                    fault_data = line[fault_index+_FAULT_STRING_HEADER_LENGTH:-11][:-1]  # need the [:-1] to remove new line from string
                    self.handle_new_fault_found(fault_data, time_stamp)

    def handle_new_fault_found(self, fault, time_stamp):
        try:
            self.fault_dict[fault] = [fault]
            self.fault_dict[fault].append(int(time_stamp))
            self.fault_dict_counter[0] += 1
        except KeyError:
            self.fault_dict_counter[fault] = [1, [time_stamp]]

def main(file_names):
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", dest="file_names",
                        help="The binary file to be writen to flash")
    args = parser.parse_args()
    fault_finder = FaultList(args.file_names)
    args = parser.parse_args()

if __name__ == '__main__':
    main(sys.argv[1:])
Here is the output of the dictionary when parsing a single file:
{'fault_01_17_00 Type:Warning': ['fault_01_17_00 Type:Warning', 37993146319], 'fault_0E_00_00 Type:Warning': ['fault_0E_00_00 Type:Warning', 38304267561], 'fault_05_01_00 Typ': ['fault_05_01_00 Typ', 38500887160]}
You can use the os module for listing files.
import os
# finds all files in a directory
files = [file for file in os.listdir('path of the files') if os.path.isfile(file)]
# filter them looking for files that end with 'txt'
txt_files = [file for file in files if file.endswith('txt')]
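To connect that to the FaultList class from the question, one option (a sketch only, assuming the FaultList class above is defined in the same script) is to accept several paths or patterns with argparse and expand any wildcard with glob, then build one FaultList per matching file:

import argparse
import glob

# assumes the FaultList class from the question is defined in this script

def main():
    parser = argparse.ArgumentParser()
    # nargs='+' accepts several names the shell has already expanded;
    # glob.glob below also handles a quoted pattern such as "filename.*txt"
    parser.add_argument("-f", "--file", dest="file_names", nargs='+',
                        help="log file name(s) or wildcard pattern")
    args = parser.parse_args()

    for pattern in args.file_names:
        for path in glob.glob(pattern):
            FaultList(path)  # each instance parses one file and prints its dictionary

if __name__ == '__main__':
    main()

Note that because fault_dict is a class attribute in the question's code, the printed dictionary accumulates faults across all files; move it into __init__ if each file should get its own dictionary.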

Is there a way to log auto generated messages on Python console?

I'm using pandas to load a csv file that has a few bad lines. This means that a few lines have some extra commas, which is why pandas cannot parse them. That is fine by me; I'm using error_bad_lines=False to ignore those lines. When pandas ignores a bad line, it shows a message like this on the console:
b'Skipping line 3: expected 3 fields, saw 4\n
What I want is to be able to load the data but record the skipped line numbers in a log file. I went through a lot of tutorials on logging but couldn't find a way to log this auto-generated message that pandas prints when it skips a line while loading the data.
This is the simple piece of code I'm using to load a file.
import pandas as pd
import os

def main():
    filename = "test_data3.csv"
    data = pd.read_csv(filename, error_bad_lines=False)
    print(data.head())

if __name__ == "__main__":
    main()
Here is the sample data I'm using
Col1,Col2,Col3
a,3,g4
b,4,s5,r
c,5,p9
f,6,v4,7
x,65,h5
As you can see, lines 2 and 4 should be skipped, but they need to be recorded in a log file.
You can use a context manager to temporarily intercept calls to sys.stderr.write and write the messages to a file:
import pandas as pd
import sys

class CaptureErrors:
    def __init__(self, stderr, output_name):
        self.stderr = stderr
        self.output_name = output_name
        self.output_file = None

    def __enter__(self):
        self.output_file = open(self.output_name, "w")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.output_file:
            self.output_file.close()
        sys.stderr = self.stderr

    def write(self, message):
        self.stderr.write(message)
        self.output_file.write(message)

def main():
    filename = "test_data3.csv"
    with CaptureErrors(sys.stderr, 'error.txt') as sys.stderr:
        data = pd.read_csv(filename, error_bad_lines=False)
        print(data.head())

if __name__ == "__main__":
    main()
If this isn't what you are looking for, you may need to add more information to your question.
You can also redirect the output into a file when running the script:
python script.py > out.txt
Since these skip messages are written to stderr rather than stdout, use 2> (or 2>&1) to capture them as well, e.g. python script.py 2> out.txt.
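Under the same assumption as the context-manager answer above, namely that pandas routes these messages through sys.stderr, a shorter variant (a sketch, not verified against every pandas version) captures them with contextlib.redirect_stderr and hands them to the logging module:

import io
import logging
from contextlib import redirect_stderr

import pandas as pd

logging.basicConfig(filename='skipped_lines.log', level=logging.WARNING)

def main():
    filename = "test_data3.csv"
    buffer = io.StringIO()
    # capture anything pandas writes to sys.stderr while parsing
    with redirect_stderr(buffer):
        data = pd.read_csv(filename, error_bad_lines=False)
    # log each captured "Skipping line ..." message
    for line in buffer.getvalue().splitlines():
        if line.strip():
            logging.warning(line)
    print(data.head())

if __name__ == "__main__":
    main()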

Reading appengine backup_info file gives EOFError

I'm trying to inspect my appengine backup files to work out when a data corruption occurred. I used gsutil to locate and download the file:
gsutil ls -l gs://my_backup/ > my_backup.txt
gsutil cp gs://my_backup/LongAlphaString.Mymodel.backup_info file://1.backup_info
I then created a small python program, attempting to read the file and parse it using the appengine libraries.
#!/usr/bin/python

APPENGINE_PATH = '/Applications/GoogleAppEngineLauncher.app/Contents/Resources/GoogleAppEngine-default.bundle/Contents/Resources/google_appengine/'
ADDITIONAL_LIBS = [
    'lib/yaml/lib'
]

import sys
sys.path.append(APPENGINE_PATH)
for l in ADDITIONAL_LIBS:
    sys.path.append(APPENGINE_PATH + l)

import logging
from google.appengine.api.files import records
import cStringIO

def parse_backup_info_file(content):
    """Returns entities iterator from a backup_info file content."""
    reader = records.RecordsReader(cStringIO.StringIO(content))
    version = reader.read()
    if version != '1':
        raise IOError('Unsupported version')
    return (datastore.Entity.FromPb(record) for record in reader)

INPUT_FILE_NAME = '1.backup_info'
f = open(INPUT_FILE_NAME, 'rb')
f.seek(0)
content = f.read()

records = parse_backup_info_file(content)
for r in records:
    logging.info(r)
f.close()
The code for parse_backup_info_file was copied from
backup_handler.py
When I run the program, I get the following output:
./view_record.py
Traceback (most recent call last):
File "./view_record.py", line 30, in <module>
records = parse_backup_info_file(content)
File "./view_record.py", line 19, in parse_backup_info_file
version = reader.read()
File "/Applications/GoogleAppEngineLauncher.app/Contents/Resources/GoogleAppEngine-default.bundle/Contents/Resources/google_appengine/google/appengine/api/files/records.py", line 335, in read
(chunk, record_type) = self.__try_read_record()
File "/Applications/GoogleAppEngineLauncher.app/Contents/Resources/GoogleAppEngine-default.bundle/Contents/Resources/google_appengine/google/appengine/api/files/records.py", line 307, in __try_read_record
(length, len(data)))
EOFError: Not enough data read. Expected: 24898 but got 2112
I've tried half a dozen different backup_info files, and they all show the same error (with different numbers).
I had noticed that they all reported the same expected length, but I was reviewing different versions of the same model when I made that observation; it is not true when I view the backup files of other modules:
EOFError: Not enough data read. Expected: 24932 but got 911
EOFError: Not enough data read. Expected: 25409 but got 2220
Is there anything obviously wrong with my approach?
I guess the other option is that the appengine backup utility is not creating valid backup files.
Anything else you can suggest would be very welcome.
Thanks in Advance
There are multiple metadata files created when an AppEngine Datastore backup is run:
LongAlphaString.backup_info is created once. This contains metadata about all of the entity types and backup files that were created in the datastore backup.
LongAlphaString.[EntityType].backup_info is created once per entity type. This contains metadata about the specific backup files created for [EntityType], along with schema information for the [EntityType].
Your code works for interrogating the file contents of LongAlphaString.backup_info; however, it seems that you are trying to interrogate the file contents of LongAlphaString.[EntityType].backup_info. Here's a script that will print the contents in a human-readable format for each file type:
import cStringIO
import os
import sys

sys.path.append('/usr/local/google_appengine')

from google.appengine.api import datastore
from google.appengine.api.files import records
from google.appengine.ext.datastore_admin import backup_pb2

ALL_BACKUP_INFO = 'long_string.backup_info'
ENTITY_KINDS = ['long_string.entity_kind.backup_info']

def parse_backup_info_file(content):
    """Returns entities iterator from a backup_info file content."""
    reader = records.RecordsReader(cStringIO.StringIO(content))
    version = reader.read()
    if version != '1':
        raise IOError('Unsupported version')
    return (datastore.Entity.FromPb(record) for record in reader)

print "*****" + ALL_BACKUP_INFO + "*****"
with open(ALL_BACKUP_INFO, 'r') as myfile:
    parsed = parse_backup_info_file(myfile.read())
    for record in parsed:
        print record

for entity_kind in ENTITY_KINDS:
    print os.linesep + "*****" + entity_kind + "*****"
    with open(entity_kind, 'r') as myfile:
        backup = backup_pb2.Backup()
        backup.ParseFromString(myfile.read())
        print backup
