How to extract application/zip from api response? - python

I have an application/octet-stream response whose body is an application/zip archive, returned as a requests.Response object from an API call, with a CSV file inside it. I am trying to read the CSV file into pandas without writing to disk, if possible.
And if I want to write the zip file to a path as a zip file, how can I do that?
resp = requests.get(url, headers=headers)
resp.raise_for_status()
csv_obj = zlib.decompress(resp.content, wbits=zlib.MAX_WBITS|32)
print(type(csv_obj))
export_file = pd.read_csv(csv_obj)
export_file.to_csv('./Test_export.csv')

Updated version
# step 1: it turns out pandas can read zipped csv files even from urls!
some_dataframe = pandas.read_csv(url)
If pandas can't figure it out by itself there are some parameters you can try to massage.
# alternative: state the compression explicitly when pandas cannot infer it
some_dataframe = pandas.read_csv(zip_filename, compression='zip', header=0) # etc..
Previous version
I will leave the previous version of my answer below for reference.
# step 1: downloading the zip file
zip_filename = 'response.zip'
with open(zip_filename, 'wb') as zip_file:
for chunk in response.iter_content(chunk_size=255):
if chunk:
zip_file.write(chunk)
# step 2: turns out pandas can read zipped csv files!
some_dataframe = pandas.read_csv(zip_filename)

import io
import zipfile

import pandas as pd
import requests

# Download the archive; raise on HTTP errors rather than trying to unzip
# an error page.  ``url`` and ``headers`` are supplied by the caller.
resp = requests.get(url, headers=headers)
resp.raise_for_status()

# ZipFile needs a seekable file object, so wrap the downloaded bytes in an
# in-memory buffer; nothing is written to disk.
with zipfile.ZipFile(io.BytesIO(resp.content)) as zfile:
    # The archive held a single file, so the last name in namelist() is
    # the (only) CSV member.
    with zfile.open(zfile.namelist()[-1]) as csv_member:
        export_file = pd.read_csv(csv_member)

Related

python download folder of text files

The goal is to download GTFS data through python web scraping, starting with https://transitfeeds.com/p/agence-metropolitaine-de-transport/129/latest/download
Currently, I'm using requests like so:
def download(url):
    """Fetch *url* and save the raw response body to a fixed local path.

    Prints a status message for both the success and the failure case and
    returns None.
    """
    fpath = "prov/city/GTFS"
    r = requests.get(url)
    if r.ok:
        print("Saving file.")
        # The context manager closes the handle (and flushes the data)
        # even if the write fails part-way.
        with open(fpath, "wb") as out_file:
            out_file.write(r.content)
    else:
        print("Download failed.")
The results of requests.content of the above url unfortunately renders the following:
You can see the files of interest within the output (e.g. stops.txt) but how might I access them to read/write?
I fear you're trying to read a zip file with a text editor, perhaps you should try using the "zipfile" module.
The following worked:
def download(url):
    """Fetch *url* and save the response body as ``output.zip``.

    The body is streamed to disk in chunks so the whole archive never has
    to sit in memory.  Prints a status message and returns None.
    """
    fpath = "path/to/output/"
    # NOTE(review): ``headers`` is not defined in this snippet; presumably a
    # module-level dict of HTTP request headers -- confirm against the caller.
    with requests.get(url, stream=True, headers=headers) as f:
        if f.ok:
            print("Saving to {}".format(fpath))
            # ``with`` guarantees the file is closed even if a chunk fails.
            with open(fpath + 'output.zip', 'wb') as g:
                for chunk in f.iter_content(chunk_size=64 * 1024):
                    g.write(chunk)
        else:
            print("Download failed with error code: ", f.status_code)
You need to write this file into a zip.
import requests

url = "https://transitfeeds.com/p/agence-metropolitaine-de-transport/129/latest/download"
fname = "gtfs.zip"
r = requests.get(url)
# Stop here on an HTTP error so an error page is never saved as a .zip.
r.raise_for_status()
# Binary mode, because the body is a zip archive; ``with`` closes the file.
with open(fname, "wb") as zip_out:
    zip_out.write(r.content)
Now fname exists and has several text files inside. If you want to programmatically extract this zip and then read the content of a file, for example stops.txt, then you need first to extract a single file, or simply extractall.
import zipfile

# Open the archive once and let the context manager close it afterwards.
with zipfile.ZipFile(fname) as archive:
    # this will extract only a single file, and
    # raise a KeyError if the file is missing from the archive
    archive.extract("stops.txt")
    # this will extract all the files found from the archive,
    # overwriting files in the process
    archive.extractall()
Now you just need to work with your file(s).
thefile = "stops.txt"
# just plain text
with open(thefile) as text_file:
    text = text_file.read()
# csv file
import csv
# newline="" is what the csv module asks for, so that quoted fields
# containing line breaks are parsed correctly.
with open(thefile, newline="") as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        ...

how to store bytes like b'PK\x03\x04\x14\x00\x08\x08\x08\x009bwR\x00\x00\x00\x00\x00\x00\x00 to dataframe or csv in python

I am requesting a URL and getting a return in bytes. I want to store this in a data frame and then to CSV.
#Get Data from the CSV
url = "someURL"
req = requests.get(URL)
url_content = req.content
csv_file = open('test.txt', 'wb')
print(type(url_content))
print(url_content)
csv_file.write(url_content)
csv_file.close()
I tried many approaches, but couldn't find the solution. The above code is storing the output in CSV, but getting the below error. My end objective is to store this in CSV then send it to google cloud. And create a google big query table.
Output:
<class 'bytes'>
b'PK\x03\x04\x14\x00\x08\x08\x08\x009bwR\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x13\x00\x00\x00[Content_Types].xml\xb5S\xcbn\xc20\x10\xfc\x95\xc8\xd76\xf4PU\x15\x81C\x1f\xc7\x16\xa9\xf4\x03\{\x93X\xf8%\xaf\xa1\xf0\xf7]\x078\x94R\x89\nq\xf2cfgfW\xf6d\xb6q\xb6ZCB\x13|\xc3\xc6|\xc4\xf0h\xe3\xbb\x86},^\xea{Va\x96^K\x1b<4\xcc\x076\x9bN\x16\xdb\x08XQ\xa9\xc7\x86\xf59\xc7\x07!P\xf5\xe0$\xf2\x10\xc1\x13\xd2\x86\xe4d\xa6c\xeaD\x94j);\x10\xb7\xa3\xd1\x9dP\xc1g\xf0\xb9\xceE\x83M'O\xd0\xca\x95\xcd\xd5\xe3\xee\xbeH7L\xc6h\x8d\x92\x99R\x89\xb5\xd7G\xa2\xf5^\x90'\xb0\x03\x07{\x13\xf1\x86\x08\xacz\xde\x90\xca\xae\x1bB\x91\x893\x1c\x8e\x0b\xcb\x99\xea\xdeh.\xc9h\xf8W\xb4\xd0\xb6F\x81\x0ej\xe5\xa8\x84CQ\xd5\xa0\xeb\x98\x88\x98\xb2\x81}\xce\xb9L\xf9U:\x12\x14D\x9e\x13\x8a\x82\xa4\xf9%\xde\x87\xb1\xa8\x90\xe0,\xc3B\xbc\xc8\xf1\xa8[\x8c\t\xa4\xc6\x1e ;\xcb\xb1\x97\t\xf4{N\xf4\x98~\x87\xd8X\xf1\x83p\xc5\x1cykOL\xa1\x04\x18\x90kN\x80V\xee\xa4\xf1\xa7\xdc\xbfBZ~\x86\xb0\xbc\x9e\x7fq\x18\xf6\x7f\xd9\x0f 
\x8aa\x19\x1fr\x88\xe1{O\xbf\x01PK\x07\x08z\x94\xcaq;\x01\x00\x00\x1c\x04\x00\x00PK\x03\x04\x14\x00\x08\x08\x08\x009bwR\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b\x00\x00\x00_rels/.rels\xad\x92\xc1j\xc30\x0c\x86_\xc5\xe8\xde8\xed`\x8cQ\xb7\x972\xe8m\x8c\xee\x014[ILb\xcb\xd8\xda\x96\xbd\xfd\xcc.[K\n\x1b\xec($}\xff\x07\xd2v?\x87I\xbdQ.\x9e\xa3\x81u\xd3\x82\xa2h\xd9\xf9\xd8\x1bx>=\xac\xee#\x15\xc1\xe8p\xe2H\x06"\xc3~\xb7}\xa2\t\xa5n\x94\xc1\xa7\xa2"\x16\x03\x83H\xba\xd7\xba\xd8\x81\x02\x96\x86\x13\xc5\xda\xe98\x07\x94Z\xe6^'\xb4#\xf6\xa47m{\xab\xf3O\x06\x9c3\xd5\xd1\x19\xc8G\xb7\x06u\xc2\xdc\x93\x18\x98'\xfd\xcey|a\x1e\x9b\x8a\xad\x8d\x8fD\xbf\t\xe5\xae\xf3\x96\x0el_\x03EY\xc8\xbe\x98\x00\xbd\xec\xb2\xf9vql\x1f3\xd7ML\xe9\xbfeh\x16\x8a\x8e\xdc*\xd5\x04\xca\xe2\xa9\3\xbaY0\xb2\x9c\xe9oJ\xd7\x8f\xa2\x03\t:\x14\xfc\xa2^\x08\xe9\xb3\x1f\xd8}\x02PK\x07\x08\xa7\x8cz\xbd\xe3\x00\x00\x00I\x02\x00\x00PK\x03\x04\x14\x00\x08\x08\x08\x009bwR\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00docProps/app.xmlM\x8e\xc1\n\xc20\x10D\xef~E\xc8\xbd\xdd\xeaAD\xd2\x94\x82\x08\x9e\xecA? \xa4\xdb6\xd0lB\xb2J?
The original URL (now edited out of the question) suggests that the downloaded file is in .xlsx format. The .xlsx format is essentially one or more xml files in a zip archive (iBug's answer is correct in this respect).
Therefore if you want to get the file's data in a dataframe, tell Pandas to read it as an excel file.
import io

import pandas as pd
import requests

url = "someURL"
# Use the ``url`` variable defined above (names are case-sensitive).
req = requests.get(url)
req.raise_for_status()
url_content = req.content
# Load into a dataframe
# The download is raw bytes, so hand read_excel a file-like wrapper around it.
df = pd.read_excel(io.BytesIO(url_content))
# Write to csv
df.to_csv('data.csv')
The initial bytes PK\x03\x04 suggest that it's in PK Zip format. Try unzipping it first, either with unzip <filename> or with Python's built-in zipfile module.

Download .txt file and extract file name

I am trying to download the file in Python from the url https://marketdata.theocc.com/position-limits?reportType=change.
I am able to convert it to DataFrame just by using:
df = pd.read_csv('https://marketdata.theocc.com/position-limits?reportType=change')
But what I want is to obtain the name of the file also.
so, if you download the file directly from browser the name of the file obtain is "POSITIONLIMITCHANGE_20201202.txt".
Can someone suggest an efficient way to do this in Python?
Thanks.
if you are using the requests library, the information about the file is in the response header (a dictionary):
response = requests.get('https://marketdata.theocc.com/position-limits?reportType=change')
print(response.headers['content-disposition'])
Output:
attachment; filename=POSITIONLIMITCHANGE_20201202.txt
Example code in Python to fetch a file from URL, extract filename, save to local file, and import into Pandas dataframe.
import io
import re

import pandas as pd
import requests

url = 'https://marketdata.theocc.com/position-limits?reportType=change'
r = requests.get(url)
# NOTE: filename is found in content-disposition HTTP response header.
# The header is optional, so fall back to an empty string when it is absent
# (re.search would raise TypeError on None).
s = r.headers.get('content-disposition') or ''
# Accept only word characters, dots and hyphens, starting with a word
# character. This keeps the extension (".txt") while still rejecting path
# separators and names that start with a dot such as "../x".
m = re.search(r'filename="?(\w[\w.-]*)', s)
if m:
    filename = m.group(1)
else:
    # set default if filename not provided or name has bad characters
    filename = "out.csv"
print("filename:", filename)
text = r.text
# if you want to write out file with filename provided
with open(filename, 'w') as fp:
    fp.write(text)
# to read from string in-memory wrap with io.StringIO()
df = pd.read_csv(io.StringIO(text))
print(list(df.columns))
Output:
filename: POSITIONLIMITCHANGE_20201202.txt
['Equity_Symbol',' ','Start_Date','Start_Pos_Limit','End_Date','End_Pos_Limit','Action']

Python: generate xlsx in memory and stream file download?

for example the following code creates the xlsx file first and then streams it as a download but I'm wondering if it is possible to send the xlsx data as it is being created. For example, imagine if a very large xlsx file needs to be generated, the user has to wait until it is finished and then receive the download, what I'd like is to start the xlsx file download in the user browser, and then send over the data as it is being generated. It seems trivial with a .csv file but not so with an xlsx file.
try:
import cStringIO as StringIO
except ImportError:
import StringIO
from django.http import HttpResponse
from xlsxwriter.workbook import Workbook
def your_view(request):
    """Build a one-sheet XLSX workbook in memory and return it as a download.

    The workbook is fully assembled before the response is created, so the
    download only starts once ``book.close()`` has finished.
    """
    # your view logic here
    # create a workbook in memory
    output = StringIO.StringIO()
    book = Workbook(output)
    sheet = book.add_worksheet('test')
    sheet.write(0, 0, 'Hello, world!')
    book.close()
    # construct response
    # ``content_type`` replaces the ``mimetype`` keyword, which current
    # Django versions no longer accept.
    response = HttpResponse(
        output.getvalue(),
        content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    response['Content-Disposition'] = "attachment; filename=test.xlsx"
    return response
Are you able to write tempfiles to disk while generating the XLSX?
If you are able to use tempfile you won't be memory bound, which is nice, but the download will still only start when the XLSX writer is done assembling the document.
If you can't write tempfiles, you'll have to follow this example http://xlsxwriter.readthedocs.org/en/latest/example_http_server.html and your code is unfortunately completely memory bound.
Streaming CSV is very easy, on the other hand. Here is code we use to stream any iterator of rows in a CSV response:
import csv
import io
def csv_generator(data_generator):
    """Yield each row of *data_generator* as one UTF-8 encoded CSV line.

    Args:
        data_generator: iterable of rows, each row an iterable of fields
            accepted by ``csv.writer.writerow``.

    Yields:
        bytes: the CSV text for one row, including the writer's line
        terminator, so a response can be streamed row by row.
    """
    # csv.writer produces text, so buffer it in a StringIO (a BytesIO raises
    # TypeError on Python 3).  newline="" leaves the writer's own "\r\n"
    # terminators untouched.
    csvfile = io.StringIO(newline="")
    csvwriter = csv.writer(csvfile)

    def read_and_flush():
        """Return the buffered text as bytes and empty the buffer."""
        data = csvfile.getvalue()
        csvfile.seek(0)
        csvfile.truncate()
        return data.encode("utf-8")

    for row in data_generator:
        csvwriter.writerow(row)
        yield read_and_flush()
def csv_stream_response(response, iterator, file_name="xxxx.csv"):
    """Turn *response* into a streamed CSV attachment and return it.

    Sets the content type, disposition, charset and encoding attributes on
    *response* and installs ``csv_generator(iterator)`` as its ``app_iter``.
    """
    # Build the header value first; the file name is wrapped in quotes.
    disposition = 'attachment;filename="' + file_name + '"'
    response.content_type = 'text/csv'
    response.content_disposition = disposition
    response.charset = 'utf8'
    # NOTE(review): Content-Encoding normally names a compression scheme
    # (e.g. gzip); 'utf8' is kept as in the original -- confirm it is wanted.
    response.content_encoding = 'utf8'
    response.app_iter = csv_generator(iterator)
    return response
xlsx format is a zip file that contains several individual files, so you can't create it on the fly and send it out as it is being created.

Downloading and unzipping a .zip file without writing to disk

I have managed to get my first python script to work which downloads a list of .ZIP files from a URL and then proceeds to extract the ZIP files and writes them to disk.
I am now at a loss to achieve the next step.
My primary goal is to download and extract the zip file and pass the contents (CSV data) via a TCP stream. I would prefer not to actually write any of the zip or extracted files to disk if I could get away with it.
Here is my current script which works but unfortunately has to write the files to disk.
import urllib, urllister
import zipfile
import urllib2
import os
import time
import pickle
# check for extraction directories existence
if not os.path.isdir('downloaded'):
os.makedirs('downloaded')
if not os.path.isdir('extracted'):
os.makedirs('extracted')
# open logfile for downloaded data and save to local variable
if os.path.isfile('downloaded.pickle'):
downloadedLog = pickle.load(open('downloaded.pickle'))
else:
downloadedLog = {'key':'value'}
# remove entries older than 5 days (to maintain speed)
# path of zip files
zipFileURL = "http://www.thewebserver.com/that/contains/a/directory/of/zip/files"
# retrieve list of URLs from the webservers
usock = urllib.urlopen(zipFileURL)
parser = urllister.URLLister()
parser.feed(usock.read())
usock.close()
parser.close()
# only parse urls
for url in parser.urls:
if "PUBLIC_P5MIN" in url:
# download the file
downloadURL = zipFileURL + url
outputFilename = "downloaded/" + url
# check if file already exists on disk
if url in downloadedLog or os.path.isfile(outputFilename):
print "Skipping " + downloadURL
continue
print "Downloading ",downloadURL
response = urllib2.urlopen(downloadURL)
zippedData = response.read()
# save data to disk
print "Saving to ",outputFilename
output = open(outputFilename,'wb')
output.write(zippedData)
output.close()
# extract the data
zfobj = zipfile.ZipFile(outputFilename)
for name in zfobj.namelist():
uncompressed = zfobj.read(name)
# save uncompressed data to disk
outputFilename = "extracted/" + name
print "Saving extracted file to ",outputFilename
output = open(outputFilename,'wb')
output.write(uncompressed)
output.close()
# send data via tcp stream
# file successfully downloaded and extracted store into local log and filesystem log
downloadedLog[url] = time.time();
pickle.dump(downloadedLog, open('downloaded.pickle', "wb" ))
Below is a code snippet I used to fetch zipped csv file, please have a look:
Python 2:
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
resp = urlopen("http://www.test.com/file.zip")
myzip = ZipFile(StringIO(resp.read()))
for line in myzip.open(file).readlines():
print line
Python 3:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
# or: requests.get(url).content
resp = urlopen("http://www.test.com/file.zip")
myzip = ZipFile(BytesIO(resp.read()))
for line in myzip.open(file).readlines():
print(line.decode('utf-8'))
Here file is a string. To get the actual string that you want to pass, you can use zipfile.namelist(). For instance,
resp = urlopen('http://mlg.ucd.ie/files/datasets/bbc.zip')
myzip = ZipFile(BytesIO(resp.read()))
myzip.namelist()
# ['bbc.classes', 'bbc.docs', 'bbc.mtx', 'bbc.terms']
My suggestion would be to use a StringIO object. They emulate files, but reside in memory. So you could do something like this:
# get_zip_data() gets a zip archive containing 'foo.txt', reading 'hey, foo'
import zipfile
from StringIO import StringIO
zipdata = StringIO()
zipdata.write(get_zip_data())
myzipfile = zipfile.ZipFile(zipdata)
foofile = myzipfile.open('foo.txt')
print foofile.read()
# output: "hey, foo"
Or more simply (apologies to Vishal):
myzipfile = zipfile.ZipFile(StringIO(get_zip_data()))
for name in myzipfile.namelist():
[ ... ]
In Python 3 use BytesIO instead of StringIO:
import zipfile
from io import BytesIO
filebytes = BytesIO(get_zip_data())
myzipfile = zipfile.ZipFile(filebytes)
for name in myzipfile.namelist():
[ ... ]
I'd like to offer an updated Python 3 version of Vishal's excellent answer, which was using Python 2, along with some explanation of the adaptations / changes, which may have been already mentioned.
from io import BytesIO
from zipfile import ZipFile
import urllib.request
url = urllib.request.urlopen("http://www.unece.org/fileadmin/DAM/cefact/locode/loc162txt.zip")
with ZipFile(BytesIO(url.read())) as my_zip_file:
for contained_file in my_zip_file.namelist():
# with open(("unzipped_and_read_" + contained_file + ".file"), "wb") as output:
for line in my_zip_file.open(contained_file).readlines():
print(line)
# output.write(line)
Necessary changes:
There's no StringIO module in Python 3 (it's been moved to io.StringIO). Instead, I use io.BytesIO, because we will be handling a bytestream -- see the Docs, and also this thread.
urlopen:
"The legacy urllib.urlopen function from Python 2.6 and earlier has been discontinued; urllib.request.urlopen() corresponds to the old urllib2.urlopen.", Docs and this thread.
Note:
In Python 3, the printed output lines will look like so: b'some text'. This is expected, as they aren't strings - remember, we're reading a bytestream. Have a look at Dan04's excellent answer.
A few minor changes I made:
I use with ... as instead of zipfile = ... according to the Docs.
The script now uses .namelist() to cycle through all the files in the zip and print their contents.
I moved the creation of the ZipFile object into the with statement, although I'm not sure if that's better.
I added (and commented out) an option to write the bytestream to file (per file in the zip), in response to NumenorForLife's comment; it adds "unzipped_and_read_" to the beginning of the filename and a ".file" extension (I prefer not to use ".txt" for files with bytestrings). The indenting of the code will, of course, need to be adjusted if you want to use it.
Need to be careful here -- because we have a byte string, we use binary mode, so "wb"; I have a feeling that writing binary opens a can of worms anyway...
I am using an example file, the UN/LOCODE text archive:
What I didn't do:
NumenorForLife asked about saving the zip to disk. I'm not sure what he meant by it -- downloading the zip file? That's a different task; see Oleh Prypin's excellent answer.
Here's a way:
import shutil
import urllib.request

# urlopen() yields bytes, so the destination must be opened in binary mode
# ('wb'); in text mode the first write raises TypeError.
with urllib.request.urlopen("http://www.unece.org/fileadmin/DAM/cefact/locode/2015-2_UNLOCODE_SecretariatNotes.pdf") as response, open("downloaded_file.pdf", 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
I'd like to add my Python3 answer for completeness:
from io import BytesIO
from zipfile import ZipFile
import requests
def get_zip(file_url):
    """Download a zip archive and open its members without touching disk.

    Args:
        file_url: URL of the zip archive.

    Returns:
        A single open file object when the archive has exactly one member,
        otherwise a list of open file objects in ``namelist()`` order.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never handed to ZipFile.
    """
    response = requests.get(file_url)
    response.raise_for_status()
    # Named ``archive`` rather than ``zipfile`` to avoid shadowing the
    # standard-library module of that name.
    archive = ZipFile(BytesIO(response.content))
    files = [archive.open(file_name) for file_name in archive.namelist()]
    return files.pop() if len(files) == 1 else files
write to a temporary file which resides in RAM
it turns out the tempfile module ( http://docs.python.org/library/tempfile.html ) has just the thing:
tempfile.SpooledTemporaryFile([max_size=0[,
mode='w+b'[, bufsize=-1[, suffix=''[,
prefix='tmp'[, dir=None]]]]]])
This
function operates exactly as
TemporaryFile() does, except that data
is spooled in memory until the file
size exceeds max_size, or until the
file’s fileno() method is called, at
which point the contents are written
to disk and operation proceeds as with
TemporaryFile().
The resulting file has one additional
method, rollover(), which causes the
file to roll over to an on-disk file
regardless of its size.
The returned object is a file-like
object whose _file attribute is either
a StringIO object or a true file
object, depending on whether
rollover() has been called. This
file-like object can be used in a with
statement, just like a normal file.
New in version 2.6.
or if you're lazy and you have a tmpfs-mounted /tmp on Linux, you can just make a file there, but you have to delete it yourself and deal with naming
Adding on to the other answers using requests:
# download from web
import requests
url = 'http://mlg.ucd.ie/files/datasets/bbc.zip'
content = requests.get(url)
# unzip the content
from io import BytesIO
from zipfile import ZipFile
f = ZipFile(BytesIO(content.content))
print(f.namelist())
# outputs ['bbc.classes', 'bbc.docs', 'bbc.mtx', 'bbc.terms']
Use help(f) to see the other available methods, e.g. extractall(), which extracts the contents of the zip file to disk so that they can later be read with open().
All of these answers appear too bulky and long. Use requests to shorten the code, e.g.:
import requests, zipfile, io
r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("/path/to/directory")
Vishal's example, however great, is confusing when it comes to the file name, and I do not see the merit of redefining 'zipfile'.
Here is my example that downloads a zip that contains some files, one of which is a csv file that I subsequently read into a pandas DataFrame:
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
import pandas
url = urlopen("https://www.federalreserve.gov/apps/mdrm/pdf/MDRM.zip")
zf = ZipFile(StringIO(url.read()))
for item in zf.namelist():
print("File in zip: "+ item)
# find the first matching csv file in the zip:
match = [s for s in zf.namelist() if ".csv" in s][0]
# the first line of the file contains a string - that line shall de ignored, hence skiprows
df = pandas.read_csv(zf.open(match), low_memory=False, skiprows=[0])
(Note, I use Python 2.7.13)
This is the exact solution that worked for me. I just tweaked it a little bit for Python 3 version by removing StringIO and adding IO library
Python 3 Version
from io import BytesIO
from zipfile import ZipFile

import pandas
import requests

url = "https://www.nseindia.com/content/indices/mcwb_jun19.zip"
content = requests.get(url)
# Fail on HTTP errors instead of passing an error page to ZipFile.
content.raise_for_status()
zf = ZipFile(BytesIO(content.content))
for item in zf.namelist():
    print("File in zip: "+ item)
# find the first csv file in the zip; match on the extension so that names
# merely containing ".csv" (e.g. "data.csv.bak") are not picked up
match = next((s for s in zf.namelist() if s.lower().endswith(".csv")), None)
if match is None:
    raise FileNotFoundError("no .csv file found in " + url)
# the first line of the file contains a string - that line shall be ignored, hence skiprows
df = pandas.read_csv(zf.open(match), low_memory=False, skiprows=[0])
It wasn't obvious in Vishal's answer what the file name was supposed to be in cases where there is no file on disk. I've modified his answer to work without modification for most needs.
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
def unzip_string(zipped_string):
    """Return the concatenated contents of every member of an in-memory zip.

    Args:
        zipped_string: the raw bytes of a zip archive.

    Returns:
        bytes: the members' contents joined in ``namelist()`` order; empty
        bytes for an archive with no members.
    """
    # Local import: the snippet's module-level imports only provide the
    # Python 2 ``StringIO``, which cannot hold zip bytes on Python 3.
    from io import BytesIO

    # ``archive`` (not ``zipfile``) avoids shadowing the stdlib module, and
    # the context manager closes the archive once it has been read.
    with ZipFile(BytesIO(zipped_string)) as archive:
        # A single join avoids rebuilding the result for every member.
        return b"".join(archive.read(name) for name in archive.namelist())
Use the zipfile module. To extract a file from a URL, you'll need to wrap the result of a urlopen call in a BytesIO object. This is because the result of a web request returned by urlopen doesn't support seeking:
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
zip_url = 'http://example.com/my_file.zip'
with urlopen(zip_url) as f:
with BytesIO(f.read()) as b, ZipFile(b) as myzipfile:
foofile = myzipfile.open('foo.txt')
print(foofile.read())
If you already have the file downloaded locally, you don't need BytesIO, just open it in binary mode and pass to ZipFile directly:
from zipfile import ZipFile
zip_filename = 'my_file.zip'
with open(zip_filename, 'rb') as f:
with ZipFile(f) as myzipfile:
foofile = myzipfile.open('foo.txt')
print(foofile.read().decode('utf-8'))
Again, note that you have to open the file in binary ('rb') mode, not as text or you'll get a zipfile.BadZipFile: File is not a zip file error.
It's good practice to use all these things as context managers with the with statement, so that they'll be closed properly.

Categories

Resources