I have a class named ExcelFile; its job is to manage Excel files (read, extract data, and do various other things for the stack).
I want to implement a system for managing errors/exceptions.
For example, ExcelFile has a method load(), which acts like a "setup":
def load(self):
"""
Setup for excel file
Load workbook, worksheet and others characteristics (data lines, header...)
:return: Setup successfully or not
:rtype: bool
Current usage
:Example:
> excefile = ExcelFile('test.xls')
> excefile.load()
True
> excefile.nb_rows()
4
"""
self.workbook = xlrd.open_workbook(self.url)
self.sheet = self.workbook.sheet_by_index(0)
self.header_row_index = self.get_header_row_index()
if self.header_row_index == None: # If file doesn't have header (or not valid)
return False
self.header_fields = self.sheet.row_values(self.header_row_index)
self.header_fields_col_ids = self.get_col_ids(self.header_fields) # Mapping between header fields and col ids
self.nb_rows = self.count_rows()
self.row_start_data = self.header_row_index + self.HEADER_ROWS
return True
As you can see, I can encounter two different errors:
The file is not an Excel file (xlrd raises xlrd.XLRDError)
The file has an invalid header (so I return False)
I want to implement a good error-management system for ExcelFile, because this class is used a lot in the stack.
This is my first idea for handling this:
Implement a custom exception:
class ExcelFileException(Exception):
def __init__(self, message, type=None):
self.message = message
self.type = type
def __str__(self):
return "{} : {} ({})".format(self.__class__.__name__, self.message, self.type)
Rewrite the load method:
def load(self):
"""
Setup for excel file
Load workbook, worksheet and others characteristics (data lines, header...)
:return: Setup successfully or not
:rtype: bool
Current usage
:Example:
> excefile = ExcelFile('test.xls')
> excefile.load()
True
> excefile.nb_rows()
4
"""
try:
self.workbook = xlrd.open_workbook(self.url)
except xlrd.XLRDError as e:
raise ExcelFileException("Unsupported file type", e.__class__.__name__)
self.sheet = self.workbook.sheet_by_index(0)
self.header_row_index = self.get_header_row_index()
if self.header_row_index == None: # If file doesn't have header (or not valid)
raise ExcelFileException("Invalid or empty header")
self.header_fields = self.sheet.row_values(self.header_row_index)
self.header_fields_col_ids = self.get_col_ids(self.header_fields) # Mapping between header fields and col ids
self.nb_rows = self.count_rows()
self.row_start_data = self.header_row_index + self.HEADER_ROWS
return True
And this is an example in a calling method. A big problem is that I have to maintain a dict named "report" with error descriptions in French, for customer success and others.
...
def foo():
...
file = ExcelFile(location)
try:
file.load()
except ExcelFileException as e:
log.warn(e.__str__())
if e.type == 'XLRDError':
self.report['errors'] = 'Long description of the error, in french (error is about invalid file type)'
else:
self.report['errors'] = 'Long description of the error, in french (error is about invalid header)'
...
What do you think about that? Do you have a better way?
Thank you
You could change your exception to log the errors in your dict:
class ExcelFileException(Exception):
def __init__(self, message, report, type=None):
report['errors'].append(message)
self.message = message
self.type = type
def __str__(self):
return "{} : {} ({})".format(self.__class__.__name__, self.message, self.type)
When you raise an exception:
raise ExcelFileException("Invalid or empty header", report)
The errors will then be present in report['errors'].
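For illustration, a minimal sketch of what the call site could then look like, assuming load() is changed to accept the report dict (a hypothetical signature) and report['errors'] is a list:
file = ExcelFile(location)
try:
    file.load(report=self.report)  # hypothetical: load() forwards the dict to ExcelFileException
except ExcelFileException as e:
    log.warning(str(e))  # the message has already been appended to self.report['errors']
This keeps the wording of the messages in one place instead of branching on e.type at every call site.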
Also, this error can sometimes be fixed by installing the missing optional dependency xlrd:
pip install xlrd
There are more Python packages available for working with Excel files.
I am working on a project in which a user uploads a file from the front-end and, without the file being stored, it goes to the backend, where some processing is done and the results are returned. Here are the functions which handle this work in the views.py file:
def midpro(request, *args):
global file
if(request.method == 'POST'):
try:
file = request.FILES['data']
except:
print("No File")
if(file!=None):
if(file.name[-1:-4:-1]!="vsc"):
return render(request, 'mapp/nocsv.html')
else:
return linml(request)
return render(request, 'mapp/nofile.html')
def linml(request, *args):
global retdata
global file
ans = list()
col = ['D1', 'D2']
if(file!=None):
ins = mapp.Mapp(file)
retdata = ins.linml()
for i in zip(col, retdata):
ans.append(i)
context = {
'ans':ans,
'data':file
}
return render(request, 'mapp/linml.html', context)
The code inside the Mapp class is:
class Mapp:
def __init__(self, file):
self.file = file
def linml(self, *args):
data = pd.read_csv(self.file)
data = np.array(data)
return([np.mean(data), np.var(data)])
pd is the alias for the pandas library.
np is the alias for the numpy library.
The error "I/O operation on closed file" occurs at the data = pd.read_csv(self.file) step.
Can anyone tell me how I can resolve this issue?
Also, if I try to explicitly open the file with:
with open(self.file) as f:
it shows the error "expected str, bytes or os.PathLike object, not InMemoryUploadedFile", which I guess means that the file is already opened.
So, can anyone please tell me why "I/O operation on closed file" is happening?
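For what it's worth, one common explanation is that an InMemoryUploadedFile only lives for the request in which it arrived, so stashing it in a global and reading it during a later request fails. A hedged sketch of one workaround (names are illustrative) is to read the bytes immediately and keep those instead of the file object:
import io
import pandas as pd

# Read the upload while it is still open, and keep only the raw bytes.
uploaded_bytes = request.FILES['data'].read()

# Later, wrap the bytes in a fresh buffer whenever pandas needs a file-like object.
data = pd.read_csv(io.BytesIO(uploaded_bytes))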
I'm looking at parsing out just the most recent reply/message from an email thread as part of a Zap.
I've found this link, but how do I use it within a Zap? https://github.com/zapier/email-reply-parser
i.e. when I pick up a thread from Gmail, how do I extract just the most recent message?
Is this possible in Code by Zapier, and if so, how?
E.g.
Input:
Yes that is fine, I will email you in the morning.
On Fri, Nov 16, 2012 at 1:48 PM, Zapier wrote:
Our support team just commented on your open Ticket:
"Hi Royce, can we chat in the morning about your question?"
Output, i.e. the parsed email:
Yes that is fine, I will email you in the morning.
First off: it's not possible to use that in a code step directly. Python code steps don't have access to external packages.
That said, that package is just Python code, and there's nothing stopping you copying all of the important code into the Code step and using it that way.
It's worth noting that the linked code is pretty old and looks to be unmaintained, so it's unlikely to work without modifications.
I had a go at adapting this https://github.com/zapier/email-reply-parser which seemed to work as well.
"""
email_reply_parser is a python library port of GitHub's Email Reply Parser.
For more information, visit https://github.com/zapier/email-reply-parser
"""
import re
class EmailReplyParser(object):
""" Represents a email message that is parsed.
"""
@staticmethod
def read(text):
""" Factory method that splits email into list of fragments
text - A string email body
Returns an EmailMessage instance
"""
return EmailMessage(text).read()
@staticmethod
def parse_reply(text):
""" Provides the reply portion of email.
text - A string email body
Returns reply body message
"""
return EmailReplyParser.read(text).reply
class EmailMessage(object):
""" An email message represents a parsed email body.
"""
SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
QUOTED_REGEX = re.compile(r'(>+)')
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)
def __init__(self, text):
self.fragments = []
self.fragment = None
self.text = text.replace('\r\n', '\n')
self.found_visible = False
def read(self):
""" Creates new fragment for each line
and labels as a signature, quote, or hidden.
Returns EmailMessage instance
"""
self.found_visible = False
is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text)
if is_multi_quote_header:
self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text)
# Fix any outlook style replies, with the reply immediately above the signature boundary line
# See email_2_2.txt for an example
self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, re.MULTILINE)
self.lines = self.text.split('\n')
self.lines.reverse()
for line in self.lines:
self._scan_line(line)
self._finish_fragment()
self.fragments.reverse()
return self
@property
def reply(self):
""" Captures reply message within email
"""
reply = []
for f in self.fragments:
if not (f.hidden or f.quoted):
reply.append(f.content)
return '\n'.join(reply)
def _scan_line(self, line):
""" Reviews each line in email message and determines fragment type
line - a row of text from an email message
"""
is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
is_quoted = self.QUOTED_REGEX.match(line) is not None
is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None
if self.fragment and len(line.strip()) == 0:
if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
self.fragment.signature = True
self._finish_fragment()
if self.fragment \
and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or
(self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))):
self.fragment.lines.append(line)
else:
self._finish_fragment()
self.fragment = Fragment(is_quoted, line, headers=is_header)
def quote_header(self, line):
""" Determines whether line is part of a quoted area
line - a row of the email message
Returns True or False
"""
return self.QUOTE_HDR_REGEX.match(line[::-1]) is not None
def _finish_fragment(self):
""" Creates fragment
"""
if self.fragment:
self.fragment.finish()
if self.fragment.headers:
# Regardless of what's been seen to this point, if we encounter a headers fragment,
# all the previous fragments should be marked hidden and found_visible set to False.
self.found_visible = False
for f in self.fragments:
f.hidden = True
if not self.found_visible:
if self.fragment.quoted \
or self.fragment.headers \
or self.fragment.signature \
or (len(self.fragment.content.strip()) == 0):
self.fragment.hidden = True
else:
self.found_visible = True
self.fragments.append(self.fragment)
self.fragment = None
class Fragment(object):
""" A Fragment is a part of
an Email Message, labeling each part.
"""
def __init__(self, quoted, first_line, headers=False):
self.signature = False
self.headers = headers
self.hidden = False
self.quoted = quoted
self._content = None
self.lines = [first_line]
def finish(self):
""" Creates block of content with lines
belonging to fragment.
"""
self.lines.reverse()
self._content = '\n'.join(self.lines)
self.lines = None
@property
def content(self):
return self._content.strip()
return {'emailstring': EmailReplyParser.parse_reply(input_data['body'])}
I want to parse a large Wikipedia dump iteratively. I found a tutorial for this here: https://towardsdatascience.com/wikipedia-data-science-working-with-the-worlds-largest-encyclopedia-c08efbac5f5c
However, when I want to read in the data like this:
data_path = 'C:\\Users\\Me\\datasets\\dewiki-latest-pages-articles1.xml-p1p262468.bz2'
import subprocess
import xml.sax
class WikiXmlHandler(xml.sax.handler.ContentHandler):
"""Content handler for Wiki XML data using SAX"""
def __init__(self):
xml.sax.handler.ContentHandler.__init__(self)
self._buffer = None
self._values = {}
self._current_tag = None
self._pages = []
def characters(self, content):
"""Characters between opening and closing tags"""
if self._current_tag:
self._buffer.append(content)
def startElement(self, name, attrs):
"""Opening tag of element"""
if name in ('title', 'text'):
self._current_tag = name
self._buffer = []
def endElement(self, name):
"""Closing tag of element"""
if name == self._current_tag:
self._values[name] = ' '.join(self._buffer)
if name == 'page':
self._pages.append((self._values['title'], self._values['text']))
# Object for handling xml
handler = WikiXmlHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
# Iteratively process file
for line in subprocess.Popen(['bzcat'],
stdin = open(data_path),
stdout = subprocess.PIPE,shell=True).stdout:
parser.feed(line)
# Stop when 3 articles have been found
if len(handler._pages) > 3:
break
it seems like nothing happens. The handler._pages list is empty; this is where the parsed articles should be stored. I also added shell=True, because otherwise I get the error message FileNotFoundError: [WinError 2].
I have never worked with subprocesses in Python, so I don't know what the problem might be.
I also tried to specify the data_path differently (with / and //).
Thank you in advance.
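For reference, a hedged sketch (assuming Python 3) of one way to sidestep the external bzcat process entirely, and with it the Windows FileNotFoundError, is the built-in bz2 module, which can stream the compressed dump line by line:
import bz2
import xml.sax

handler = WikiXmlHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Stream the compressed dump without spawning an external process.
with bz2.open(data_path, 'rb') as f:
    for line in f:
        parser.feed(line)
        if len(handler._pages) > 3:  # stop after a few articles, as in the original loop
            break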
I'm pulling commit data from the Gerrit API, and the number of commits is in the 226,000 range. Since I have to make a request to an endpoint for each and every commit, this is understandably taking a long time. I was wondering how I could best implement threading into my current process.
I have two classes: a Project class, which drills down and retrieves all commits associated with it and saves them as Commit objects that contain all the information necessary to then loop through and get the JSON associated with each one. I am pulling them all into a big list and then iterating through it to call the get_data and write_data methods.
class Project(object):
def __init__(self, name):
self.name = name
self.commits = []
def add_commits(self, changes_list):
for change in changes_list:
change_id=change['change_id'],
revision_list=change['revisions']
self.commits.extend([Commit(rid, change_id)
for rid in revision_list.keys()])
def return_results(self, ger_obj, start=0):
self.ger = ger_obj
while True:
endpoint = (r'/changes/?q=project:{project}&o=ALL_REVISIONS&'
r'S={num}'.format(
project=self.name,
num=start
))
logging.info('Endpoint: {}'.format(endpoint))
try:
changes = ger_obj.get(endpoint)
self.add_commits(changes_list=changes)
except HTTPError:
break
start += 500
try:
if not changes[-1].get('_more_changes'):
break
except IndexError:
break
class Commit(object):
def __init__(self, rev_id, change_id):
self.rev_id = rev_id
self.change_id = change_id
def get_data(self, ger_obj):
endpoint = (r'/changes/{c_id}/revisions/{r_id}/commit'.format(
c_id=self.change_id[0],
r_id=self.rev_id
))
try:
self.data = ger_obj.get(endpoint)
except HTTPError as e:
logging.warning('Endpoint: {} did not return data'.format(
endpoint
))
else:
self.data['commitid'] = self.data.get('commit')
self.data['name'] = self.data.get('committer')['name']
self.data['email'] = self.data.get('committer')['email']
self.data['date'] = self.data.get('committer')['date']
hash = md5()
hash.update(json.dumps(self.data).encode('utf-8'))
self.data['etl_checksum_md5'] = hash.hexdigest()
self.data['etl_process_status'] = ETL_PROCESS_STATUS
self.data['etl_datetime_local'] = ETL_DATETIME_LOCAL
self.data['etl_pdi_version'] = ETL_PDI_VERSION
self.data['etl_pdi_build_version'] = ETL_PDI_BUILD_VERSION
self.data['etl_pdi_hostname'] = ETL_PDI_HOSTNAME
self.data['etl_pdi_ipaddress'] = ETL_PDI_IPADDRESS
self.data['message'] = self.data['message'].replace('\n', ' ').replace('|', '[pipe]')
def write_data(self, writer):
writer.writerow(self.data)
I'm thinking that the best place to implement the threads is once I have all the commits in a list and am ready to iterate over them:
projects = [Project(value['id']) for value in project_data.values()]
for project in projects[:10]:
if project.name in bad_names.keys():
project.name = bad_names[project.name]
project.return_results(rest)
all_commits.extend(project.commits)
fieldnames = get_fieldnames(
'ods_gerrit.staging_gerrit_commits',
REDSHIFT_POSTGRES_INFO)
with open('testfile.csv', 'wb') as outf:
writer = DictWriter(
outf,
fieldnames=fieldnames,
extrasaction='ignore',
delimiter='|'
)
# Implement Threading?
for commit in all_commits:
commit.get_data(rest)
try:
commit.write_data(writer=writer)
except AttributeError:
continue
except Exception:
print commit.data, 'caused an exception.'
continue
I've read a few threading tutorials and am unsure how to properly do this. I'm particularly worried about overwriting data due to improper locking.
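Not a definitive design, but a minimal sketch of one common pattern: fan the network-bound get_data calls out to a thread pool and keep all writing in the main thread, so the csv writer never needs a lock (multiprocessing.dummy is available on both Python 2 and 3):
from multiprocessing.dummy import Pool  # thread-based pool with the multiprocessing API

def fetch(commit):
    commit.get_data(rest)  # network-bound, safe to run in parallel
    return commit

pool = Pool(8)  # number of worker threads; tune as needed
for commit in pool.imap_unordered(fetch, all_commits):
    try:
        commit.write_data(writer=writer)  # writing stays in this one thread, so no locking is needed
    except AttributeError:
        continue
pool.close()
pool.join()
Only the HTTP requests run concurrently here; the DictWriter is touched by a single thread, which avoids the overwriting concern.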
Using urllib (or urllib2) to get what I want seems hopeless.
Any solution?
I'm not sure how the C# implementation works, but, as internet streams are generally not seekable, my guess would be it downloads all the data to a local file or in-memory object and seeks within it from there. The Python equivalent of this would be to do as Abafei suggested and write the data to a file or StringIO and seek from there.
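As a rough Python 2 illustration of that first approach (download everything once, then seek locally in the in-memory copy), matching the urllib2 style used below:
import urllib2
from StringIO import StringIO

# Pull the whole page down once, then treat the in-memory copy as a seekable file.
data = urllib2.urlopen("http://www.python.org/").read()
buf = StringIO(data)
buf.seek(17387)  # arbitrary seeking now works on the local copy
print buf.read(100)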
However, if, as your comment on Abafei's answer suggests, you want to retrieve only a particular part of the file (rather than seeking backwards and forwards through the returned data), there is another possibility. urllib2 can be used to retrieve a certain section (or 'range' in HTTP parlance) of a webpage, provided that the server supports this behaviour.
The Range header
When you send a request to a server, the parameters of the request are given in various headers. One of these is the Range header, defined in section 14.35 of RFC2616 (the specification defining HTTP/1.1). This header allows you to do things such as retrieve all data starting from the 10,000th byte, or the data between bytes 1,000 and 1,500.
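For example, the two cases just mentioned correspond to the following raw header values (protocol syntax only, not tied to any particular library):
Range: bytes=10000-
Range: bytes=1000-1500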
Server support
There is no requirement for a server to support range retrieval. Some servers will return the Accept-Ranges header (section 14.5 of RFC2616) along with a response to report if they support ranges or not. This could be checked using a HEAD request. However, there is no particular need to do this; if a server does not support ranges, it will return the entire page and we can then extract the desired portion of data in Python as before.
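If you did want to perform that check, a hedged urllib2 sketch (in the same Python 2 style as the script below) could look like this:
import urllib2

request = urllib2.Request("http://www.python.org/")
request.get_method = lambda: "HEAD"  # ask for the headers only, not the body
response = urllib2.urlopen(request)
print response.headers.get("Accept-Ranges", "none")  # 'bytes' means ranges are supported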
Checking if a range is returned
If a server returns a range, it must send the Content-Range header (section 14.16 of RFC2616) along with the response. If this is present in the headers of the response, we know a range was returned; if it is not present, the entire page was returned.
Implementation with urllib2
urllib2 allows us to add headers to a request, thus allowing us to ask the server for a range rather than the entire page. The following script takes a URL, a start position, and (optionally) a length on the command line, and tries to retrieve the given section of the page.
import sys
import urllib2
# Check command line arguments.
if len(sys.argv) < 3:
sys.stderr.write("Usage: %s url start [length]\n" % sys.argv[0])
sys.exit(1)
# Create a request for the given URL.
request = urllib2.Request(sys.argv[1])
# Add the header to specify the range to download.
if len(sys.argv) > 3:
start, length = map(int, sys.argv[2:])
request.add_header("range", "bytes=%d-%d" % (start, start + length - 1))
else:
request.add_header("range", "bytes=%s-" % sys.argv[2])
# Try to get the response. This will raise a urllib2.URLError if there is a
# problem (e.g., invalid URL).
response = urllib2.urlopen(request)
# If a content-range header is present, partial retrieval worked.
if "content-range" in response.headers:
print "Partial retrieval successful."
# The header contains the string 'bytes', followed by a space, then the
# range in the format 'start-end', followed by a slash and then the total
# size of the page (or an asterisk if the total size is unknown). Let's get
# the range and total size from this.
range, total = response.headers['content-range'].split(' ')[-1].split('/')
# Print a message giving the range information.
if total == '*':
print "Bytes %s of an unknown total were retrieved." % range
else:
print "Bytes %s of a total of %s were retrieved." % (range, total)
# No header, so partial retrieval was unsuccessful.
else:
print "Unable to use partial retrieval."
# And for good measure, let's check how much data we downloaded.
data = response.read()
print "Retrieved data size: %d bytes" % len(data)
Using this, I can retrieve the final 2,000 bytes of the Python homepage:
blair@blair-eeepc:~$ python retrieverange.py http://www.python.org/ 17387
Partial retrieval successful.
Bytes 17387-19386 of a total of 19387 were retrieved.
Retrieved data size: 2000 bytes
Or 400 bytes from the middle of the homepage:
blair@blair-eeepc:~$ python retrieverange.py http://www.python.org/ 6000 400
Partial retrieval successful.
Bytes 6000-6399 of a total of 19387 were retrieved.
Retrieved data size: 400 bytes
However, the Google homepage does not support ranges:
blair@blair-eeepc:~$ python retrieverange.py http://www.google.com/ 1000 500
Unable to use partial retrieval.
Retrieved data size: 9621 bytes
In this case, it would be necessary to extract the data of interest in Python prior to any further processing.
It may work best just to write the data to a file (or even to a string, using StringIO), and to seek in that file (or string).
I did not find any existing implementations of a file-like interface with seek() to HTTP URLs, so I rolled my own simple version: https://github.com/valgur/pyhttpio. It depends on urllib.request but could probably easily be modified to use requests, if necessary.
The full code:
import cgi
import time
import urllib.request
from io import IOBase
from sys import stderr
class SeekableHTTPFile(IOBase):
def __init__(self, url, name=None, repeat_time=-1, debug=False):
"""Allow a file accessible via HTTP to be used like a local file by utilities
that use `seek()` to read arbitrary parts of the file, such as `ZipFile`.
Seeking is done via the 'range: bytes=xx-yy' HTTP header.
Parameters
----------
url : str
A HTTP or HTTPS URL
name : str, optional
The filename of the file.
Will be filled from the Content-Disposition header if not provided.
repeat_time : int, optional
In case of HTTP errors wait `repeat_time` seconds before trying again.
Negative value or `None` disables retrying and simply passes on the exception (the default).
"""
super().__init__()
self.url = url
self.name = name
self.repeat_time = repeat_time
self.debug = debug
self._pos = 0
self._seekable = True
with self._urlopen() as f:
if self.debug:
print(f.getheaders())
self.content_length = int(f.getheader("Content-Length", -1))
if self.content_length < 0:
self._seekable = False
if f.getheader("Accept-Ranges", "none").lower() != "bytes":
self._seekable = False
if name is None:
header = f.getheader("Content-Disposition")
if header:
value, params = cgi.parse_header(header)
self.name = params["filename"]
def seek(self, offset, whence=0):
if not self.seekable():
raise OSError
if whence == 0:
self._pos = 0
elif whence == 1:
pass
elif whence == 2:
self._pos = self.content_length
self._pos += offset
return self._pos
def seekable(self, *args, **kwargs):
return self._seekable
def readable(self, *args, **kwargs):
return not self.closed
def writable(self, *args, **kwargs):
return False
def read(self, amt=-1):
if self._pos >= self.content_length:
return b""
if amt < 0:
end = self.content_length - 1
else:
end = min(self._pos + amt - 1, self.content_length - 1)
byte_range = (self._pos, end)
self._pos = end + 1
with self._urlopen(byte_range) as f:
return f.read()
def readall(self):
return self.read(-1)
def tell(self):
return self._pos
def __getattribute__(self, item):
attr = object.__getattribute__(self, item)
if not object.__getattribute__(self, "debug"):
return attr
if hasattr(attr, '__call__'):
def trace(*args, **kwargs):
a = ", ".join(map(str, args))
if kwargs:
a += ", ".join(["{}={}".format(k, v) for k, v in kwargs.items()])
print("Calling: {}({})".format(item, a))
return attr(*args, **kwargs)
return trace
else:
return attr
def _urlopen(self, byte_range=None):
header = {}
if byte_range:
header = {"range": "bytes={}-{}".format(*byte_range)}
while True:
try:
r = urllib.request.Request(self.url, headers=header)
return urllib.request.urlopen(r)
except urllib.error.HTTPError as e:
if self.repeat_time is None or self.repeat_time < 0:
raise
print("Server responded with " + str(e), file=stderr)
print("Sleeping for {} seconds before trying again".format(self.repeat_time), file=stderr)
time.sleep(self.repeat_time)
A potential usage example:
url = "https://www.python.org/ftp/python/3.5.0/python-3.5.0-embed-amd64.zip"
f = SeekableHTTPFile(url, debug=True)
zf = ZipFile(f)
zf.printdir()
zf.extract("python.exe")
Edit: There is actually a mostly identical, if slightly more minimal, implementation in this answer: https://stackoverflow.com/a/7852229/2997179