# Parse HTTP headers
headers = request.split('\n')
filename = headers[0].split()[1]
# Get the content of the file
if filename == '/':
filename = '/index.html'
# Get the content of htdocs/index.html
try:
fin = open(filename)
content = fin.read()
fin.close()
#Error, not found
except FileNotFoundError:
response = 'HTTP/1.0 404 NOT FOUND\n\nFile Not Found'
unable to get this to work below any ideas?? Unsure what exactly could be wrong here
Related
with open('URLS/Gibiru_urls.txt', 'r') as urls:
for url in urls.readlines():
url = url.rstrip("\n")
download_url(url)
I want to read urls from .txt file in one directory (Root/URLS/Gibiru_urls.txt) and output into another directory (Root/Images/Gibiru_pics). My python file is located in (Root)
def download_url(file_url):
print("downloading: ",file_url)
file_name_start_pos = file_url.rfind("/") + 1
file_name = file_url[file_name_start_pos:]
os.system("cd Images/Gibiru_pics")
r = requests.get(file_url, stream=True)
if r.status_code == requests.codes.ok:
with open(file_name, 'wb') as f:
for data in r:
f.write(data)
I was able to re-direct. It was the os.chdir() method I was looking for.
def Gibiru():
output_dir = '/multiple_image_gathering-main/Images/Gibiru_pics'
with open('URLS/Gibiru_urls.txt', 'r') as urls:
for url in urls.readlines():
url = url.rstrip("\n")
download_url(url, output_dir)
def download_url(file_url, output_dir):
os.chdir(output_dir)
def url_to_jpg(i, url, FILE_PATH):
try:
url_basename = url.split("/")[-1]
filename = '{}.jpg'.format(url_basename.rsplit( ".", 1 )[ 0 ])
full_path = '{}{}'.format(FILE_PATH, filename)
response = urllib.request.urlretrieve(url, full_path)
print('{} saved.'.format(full_path))
return None
except HTTPError as err:
print(err)
except:
e = sys.exc_info()[0]
print(e)
That's the central part of my code, without the elements and stuff, what can I do to be able to avoid that 403 error, maybe trying selenium?
def url_to_jpg(i, url, filepath):
url_basename = url.split("/")[-1]
imagename = '{}.jpg'.format(url_basename.rsplit( ".", 1 )[0])
fullpath = '{}{}'.format(filepath, imagename)
response = requests.get(url, fullpath)
file = open(imagename, "wb")
file.write(response.content)
file.close()
print('{} saved.'.format(fullpath))
return None
filename = 'libro.csv'
filepath = 'fotospython/'
urls = pd.read_csv(filename)
for i, url in enumerate(urls.values):
url_to_jpg(i, url[0], filepath)
that was my final requests using code, it works, but can't put the files in the folder, nowadays, worked
I'm trying to download an image from a website but I get a 404 error. I tried to add a user agent with no sucess.
Here is the code:
import requests
import shutil
with open(r'C:\Users\home\Desktop\urls.csv') as file:
csv = []
for row in file:
csv.append(row.split(";"))
row = 0
while row < len(csv):
r = requests.get(csv[row][0], stream=True, headers={'User-agent': 'Mozilla/5.0'})
if r.status_code == 200:
with open(r"C:\Users\home\Desktop\images\house" + str(row) + ".jpg", 'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
row +=1
The url is:
https://example.com/wp-content/uploads/2018/10/toronto-curbed-8.jpg
replace example by cdn.icepop
I need to download several files via http in Python.
The most obvious way to do it is just using urllib2:
import urllib2
u = urllib2.urlopen('http://server.com/file.html')
localFile = open('file.html', 'w')
localFile.write(u.read())
localFile.close()
But I'll have to deal with the URLs that are nasty in some way, say like this: http://server.com/!Run.aspx/someoddtext/somemore?id=121&m=pdf. When downloaded via the browser, the file has a human-readable name, ie. accounts.pdf.
Is there any way to handle that in python, so I don't need to know the file names and hardcode them into my script?
Download scripts like that tend to push a header telling the user-agent what to name the file:
Content-Disposition: attachment; filename="the filename.ext"
If you can grab that header, you can get the proper filename.
There's another thread that has a little bit of code to offer up for Content-Disposition-grabbing.
remotefile = urllib2.urlopen('http://example.com/somefile.zip')
remotefile.info()['Content-Disposition']
Based on comments and #Oli's anwser, I made a solution like this:
from os.path import basename
from urlparse import urlsplit
def url2name(url):
return basename(urlsplit(url)[2])
def download(url, localFileName = None):
localName = url2name(url)
req = urllib2.Request(url)
r = urllib2.urlopen(req)
if r.info().has_key('Content-Disposition'):
# If the response has Content-Disposition, we take file name from it
localName = r.info()['Content-Disposition'].split('filename=')[1]
if localName[0] == '"' or localName[0] == "'":
localName = localName[1:-1]
elif r.url != url:
# if we were redirected, the real file name we take from the final URL
localName = url2name(r.url)
if localFileName:
# we can force to save the file as specified name
localName = localFileName
f = open(localName, 'wb')
f.write(r.read())
f.close()
It takes file name from Content-Disposition; if it's not present, uses filename from the URL (if redirection happened, the final URL is taken into account).
Combining much of the above, here is a more pythonic solution:
import urllib2
import shutil
import urlparse
import os
def download(url, fileName=None):
def getFileName(url,openUrl):
if 'Content-Disposition' in openUrl.info():
# If the response has Content-Disposition, try to get filename from it
cd = dict(map(
lambda x: x.strip().split('=') if '=' in x else (x.strip(),''),
openUrl.info()['Content-Disposition'].split(';')))
if 'filename' in cd:
filename = cd['filename'].strip("\"'")
if filename: return filename
# if no filename was found above, parse it out of the final URL.
return os.path.basename(urlparse.urlsplit(openUrl.url)[2])
r = urllib2.urlopen(urllib2.Request(url))
try:
fileName = fileName or getFileName(url,r)
with open(fileName, 'wb') as f:
shutil.copyfileobj(r,f)
finally:
r.close()
2 Kender:
if localName[0] == '"' or localName[0] == "'":
localName = localName[1:-1]
it is not safe -- web server can pass wrong formatted name as ["file.ext] or [file.ext'] or even be empty and localName[0] will raise exception.
Correct code can looks like this:
localName = localName.replace('"', '').replace("'", "")
if localName == '':
localName = SOME_DEFAULT_FILE_NAME
Using wget:
custom_file_name = "/custom/path/custom_name.ext"
wget.download(url, custom_file_name)
Using urlretrieve:
urllib.urlretrieve(url, custom_file_name)
urlretrieve also creates the directory structure if not exists.
You need to look into 'Content-Disposition' header, see the solution by kender.
How to download a file using python in a 'smarter' way?
Posting his solution modified with a capability to specify an output folder:
from os.path import basename
import os
from urllib.parse import urlsplit
import urllib.request
def url2name(url):
return basename(urlsplit(url)[2])
def download(url, out_path):
localName = url2name(url)
req = urllib.request.Request(url)
r = urllib.request.urlopen(req)
if r.info().has_key('Content-Disposition'):
# If the response has Content-Disposition, we take file name from it
localName = r.info()['Content-Disposition'].split('filename=')[1]
if localName[0] == '"' or localName[0] == "'":
localName = localName[1:-1]
elif r.url != url:
# if we were redirected, the real file name we take from the final URL
localName = url2name(r.url)
localName = os.path.join(out_path, localName)
f = open(localName, 'wb')
f.write(r.read())
f.close()
download("https://example.com/demofile", '/home/username/tmp')
I have just updated the answer of kender for python3
I am code a download function in python. The file size >1GB. The server is linux, HTTP server is Karrigell. Client is browse, Firefox or IE. I meet a big trouble.
At first, I use sys.stdout() to send file content.
file = open(path, 'rb')
size = os.path.getsize(path)
RESPONSE['Pragma'] = 'public'
RESPONSE['Expires'] = '0'
RESPONSE['Cache-Control'] = 'must-revalidate, pre-check=0'
RESPONSE['Content-Disposition'] = 'attachment; filename="' + os.path.basename(path) + '"'
RESPONSE['Content-type'] = "application/octet-stream"
RESPONSE['Content-Transfer-Encoding'] = 'binary'
RESPONSE['Content-length'] = str(os.path.getsize(path))
sys.stdout.flush()
chunk_size = 10000
handle = open(path, "rb")
while True:
buffer = handle.read(chunk_size)
if buffer:
STDOUT(buffer)
else:
break
sys.stdout.flush()
The problem is the server out of memory! I know, stdout write content to memory first, then memory send to socket.
So, I modify the function. Send content to socket directly. I use the py-sendfile module. http://code.google.com/p/py-sendfile/
file = open(path, 'rb')
size = os.path.getsize(path)
sock = REQUEST_HANDLER.sock
sock.sendall("""HTTP/1.1 200 OK\r\nPragma: no-cache\r\nExpires: 0\r\nCache-Control: no-cache, no-store\r\nContent-Disposition: attachment; filename="%s"\r\nContent-Type: application/octet-stream\r\nContent-Length: %u\r\nContent-Range: bytes 0-4096/%u\r\nLocation: "%s"\r\n\r\n""" % (os.path.basename(path), size, size, os.path.basename(path)))
offset = 0
nbytes = 4096
while 1:
try:
sent = sendfile.sendfile(sock.fileno(), file.fileno(), offset, nbytes)
except OSError, err:
if err.errno in (errno.EAGAIN, errno.EBUSY): # retry
continue
raise
else:
if sent == 0:
break # done
offset += sent
This time, the server memory is OK, but browse die! The browse memory rise quickly! Not free
until the socket accept whole file content.
I don't know how to deal with these problems. I think the second idea is right, send content to socket directly. But why browse can't free memory while accept data?
You should try to download the file in chunks. This is an example that works for me using urllib2
import os
import urllib2
import math
def downloadChunks(url):
"""Helper to download large files
the only arg is a url
this file will go to a temp directory
the file will also be downloaded
in chunks and print out how much remains
"""
baseFile = os.path.basename(url)
#move the file to a more uniq path
os.umask(0002)
temp_path = "/tmp/"
try:
file = os.path.join(temp_path,baseFile)
req = urllib2.urlopen(url)
total_size = int(req.info().getheader('Content-Length').strip())
downloaded = 0
CHUNK = 256 * 10240
with open(file, 'wb') as fp:
while True:
chunk = req.read(CHUNK)
downloaded += len(chunk)
print math.floor( (downloaded / total_size) * 100 )
if not chunk: break
fp.write(chunk)
except urllib2.HTTPError, e:
print "HTTP Error:",e.code , url
return False
except urllib2.URLError, e:
print "URL Error:",e.reason , url
return False
return file