I want to read URLs from a .txt file in one directory (Root/URLS/Gibiru_urls.txt) and save the output into another directory (Root/Images/Gibiru_pics). My Python file is located in Root.

with open('URLS/Gibiru_urls.txt', 'r') as urls:
    for url in urls.readlines():
        url = url.rstrip("\n")
        download_url(url)
def download_url(file_url):
    print("downloading: ", file_url)
    file_name_start_pos = file_url.rfind("/") + 1
    file_name = file_url[file_name_start_pos:]
    os.system("cd Images/Gibiru_pics")  # note: this changes directory only in the spawned subshell
    r = requests.get(file_url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(file_name, 'wb') as f:
            for data in r:
                f.write(data)
I was able to redirect the output; os.chdir() was the method I was looking for.
def Gibiru():
    output_dir = '/multiple_image_gathering-main/Images/Gibiru_pics'
    with open('URLS/Gibiru_urls.txt', 'r') as urls:
        for url in urls.readlines():
            url = url.rstrip("\n")
            download_url(url, output_dir)

def download_url(file_url, output_dir):
    os.chdir(output_dir)
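As a side note, changing the process-wide working directory affects every relative path in the script. A minimal sketch of an alternative (my own variation, not part of the original answer) that joins the output directory into the file path instead:

import os
import requests

def download_url(file_url, output_dir):
    # Take the file name from the last URL segment
    file_name = file_url.rsplit("/", 1)[-1]
    r = requests.get(file_url, stream=True)
    if r.status_code == requests.codes.ok:
        # Build the destination path explicitly instead of calling os.chdir()
        with open(os.path.join(output_dir, file_name), 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)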
Good day, I need to test this function using requests_mock:
def loader(link, output='os.getcwd'):
    # Function loading a page from the link
    response = requests.get(link)
    data = response.text
    file_name = modify_file_name(link)
    if output == 'os.getcwd':
        directory = os.getcwd()
    else:
        directory = output
    filepath = os.path.join(directory, file_name + '.html')
    with open(filepath, 'w') as page:
        page.write(data)
    return filepath
I couldn't find a beginner-friendly tutorial or instructions for this. I would be grateful for advice.
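One way to start: requests-mock ships a pytest plugin, so once it is installed a test receives a requests_mock fixture that intercepts calls made through requests. A minimal sketch (the import path your_module is an assumption, and the exact file name depends on your modify_file_name):

from your_module import loader  # hypothetical import path

def test_loader(requests_mock, tmp_path):
    # No real network traffic: the fixture intercepts the GET
    requests_mock.get('https://example.com/page', text='<html>hello</html>')
    filepath = loader('https://example.com/page', output=str(tmp_path))
    # The file name depends on modify_file_name(), so assert on the content
    with open(filepath) as page:
        assert page.read() == '<html>hello</html>'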
def url_to_jpg(i, url, FILE_PATH):
    try:
        url_basename = url.split("/")[-1]
        filename = '{}.jpg'.format(url_basename.rsplit(".", 1)[0])
        full_path = '{}{}'.format(FILE_PATH, filename)
        response = urllib.request.urlretrieve(url, full_path)
        print('{} saved.'.format(full_path))
        return None
    except HTTPError as err:
        print(err)
    except:
        e = sys.exc_info()[0]
        print(e)
That's the central part of my code, with the surrounding pieces omitted. What can I do to avoid that 403 error? Maybe try Selenium?
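Before reaching for Selenium: a 403 from urlretrieve is often just the server rejecting the default "Python-urllib" User-Agent. A small sketch of sending a browser-like header instead (the header value is an assumption and may need adjusting per site):

import urllib.request

def fetch(url, full_path):
    # Many servers return 403 for urllib's default User-Agent
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as resp, open(full_path, 'wb') as f:
        f.write(resp.read())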
def url_to_jpg(i, url, filepath):
    url_basename = url.split("/")[-1]
    imagename = '{}.jpg'.format(url_basename.rsplit(".", 1)[0])
    fullpath = '{}{}'.format(filepath, imagename)
    response = requests.get(url)  # the second positional argument of requests.get is params, not a path
    file = open(fullpath, "wb")  # open fullpath, not imagename, so the file lands in the folder
    file.write(response.content)
    file.close()
    print('{} saved.'.format(fullpath))
    return None
filename = 'libro.csv'
filepath = 'fotospython/'
urls = pd.read_csv(filename)
for i, url in enumerate(urls.values):
    url_to_jpg(i, url[0], filepath)
That was my final requests-based code. It ran, but it couldn't put the files in the folder until the open() call was pointed at fullpath instead of imagename, as noted above.
After downloading the dependencies from Nexus, I have a download path for the data, but I wasn't able to open the resulting text file; it's not responding. Why is this?
for item in data["items"]:
for asset in item["assets"]:
fileurl = asset["downloadUrl"]
print(fileurl)
downloadPath = '/home/centos/'
filename = downloadPath + fileurl.split('/')[-1]# '\' for Windows
outfile = open(filename, "w")
outfile.write(str(urllib.request.urlopen(fileurl).read()))
outfile.close()
if data["continuationToken"] is None:
sys.exit()
else:
#construct pagination url and loop
url = baseurl + 'components?continuationToken=' + data["continuationToken"] + '&repository=' + downloadRepository
return
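The fragment above ends mid-function, so for context, here is one shape the surrounding pagination loop can take against the Nexus components API (a sketch; the host in baseurl and the repository name are assumptions):

import requests

baseurl = 'http://nexus.example.com/service/rest/v1/'  # hypothetical host
downloadRepository = 'my-repo'                         # hypothetical repository

url = baseurl + 'components?repository=' + downloadRepository
while True:
    data = requests.get(url).json()
    for item in data["items"]:
        for asset in item["assets"]:
            print(asset["downloadUrl"])  # download each asset as in the snippet above
    if data["continuationToken"] is None:
        break
    url = (baseurl + 'components?continuationToken=' + data["continuationToken"]
           + '&repository=' + downloadRepository)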
url="someurl"
outputfile='./file.zip'
link=urllib.urlopen(url)
soup= bs4.BeautifulSoup(link,'lxml')
links=[]
for data in soup.find_all('div', class_='master_content-outer-container'):
for a in data.find_all('a'):
links.append(a.get('href'))
output = open(outputfile, "wb")
for i in links:
request=urllib.urlopen(i)
read=request.read()
output.write(read)
output.close()
zip_ref= zipfile.ZipFile(outputfile,'r')
zip_ref.extractall('./data/')
zip_ref.close()
I have URLs stored in a list, which I am supplying to urllib. Each URL ends with a .zip extension. When I run this code, I get only the last file from the list, yet there are over 400 links to be downloaded. Am I missing something?
You are writing all your files into a single output file; that's not going to work. Try this:
import os

url = "someurl"
outputfile = './file.zip'
link = urllib.urlopen(url)
soup = bs4.BeautifulSoup(link, 'lxml')
links = []
for data in soup.find_all('div', class_='master_content-outer-container'):
    for a in data.find_all('a'):
        links.append(a.get('href'))
for i in links:
    request = urllib.urlopen(i)
    read = request.read()
    file_name = os.path.basename(i)
    output = open(file_name, "wb")
    output.write(read)
    output.close()
    zip_ref = zipfile.ZipFile(file_name, 'r')
    zip_ref.extractall('./data/')
    zip_ref.close()
Option 2
import os

url = "someurl"
outputfile = './file.zip'
link = urllib.urlopen(url)
soup = bs4.BeautifulSoup(link, 'lxml')

def download_and_extract(link):
    request = urllib.urlopen(link)
    read = request.read()
    file_name = os.path.basename(link)
    output = open(file_name, "wb")
    output.write(read)
    output.close()
    zip_ref = zipfile.ZipFile(file_name, 'r')
    zip_ref.extractall('./data/')
    zip_ref.close()

for data in soup.find_all('div', class_='master_content-outer-container'):
    for a in data.find_all('a'):
        download_and_extract(a.get('href'))
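Both options use the Python 2 urllib API (urllib.urlopen). Under Python 3 the per-file part of the loop would look roughly like this (a sketch with the same assumed page structure):

import os
import zipfile
import urllib.request

def download_and_extract(link):
    # Save each archive under its own name, then extract it
    file_name = os.path.basename(link)
    with urllib.request.urlopen(link) as request, open(file_name, "wb") as output:
        output.write(request.read())
    with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall('./data/')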
I need to download several files via http in Python.
The most obvious way to do it is just using urllib2:
import urllib2
u = urllib2.urlopen('http://server.com/file.html')
localFile = open('file.html', 'w')
localFile.write(u.read())
localFile.close()
But I'll have to deal with URLs that are nasty in some way, like this: http://server.com/!Run.aspx/someoddtext/somemore?id=121&m=pdf. When downloaded via the browser, the file has a human-readable name, i.e. accounts.pdf.
Is there any way to handle that in python, so I don't need to know the file names and hardcode them into my script?
Download scripts like that tend to push a header telling the user-agent what to name the file:
Content-Disposition: attachment; filename="the filename.ext"
If you can grab that header, you can get the proper filename.
There's another thread that has a little bit of code to offer up for Content-Disposition-grabbing.
remotefile = urllib2.urlopen('http://example.com/somefile.zip')
remotefile.info()['Content-Disposition']
Based on comments and @Oli's answer, I made a solution like this:
from os.path import basename
from urlparse import urlsplit

def url2name(url):
    return basename(urlsplit(url)[2])

def download(url, localFileName=None):
    localName = url2name(url)
    req = urllib2.Request(url)
    r = urllib2.urlopen(req)
    if r.info().has_key('Content-Disposition'):
        # If the response has Content-Disposition, we take the file name from it
        localName = r.info()['Content-Disposition'].split('filename=')[1]
        if localName[0] == '"' or localName[0] == "'":
            localName = localName[1:-1]
    elif r.url != url:
        # if we were redirected, take the real file name from the final URL
        localName = url2name(r.url)
    if localFileName:
        # we can force saving the file under a specified name
        localName = localFileName
    f = open(localName, 'wb')
    f.write(r.read())
    f.close()
It takes the file name from Content-Disposition; if that header is not present, it uses the file name from the URL (if a redirection happened, the final URL is taken into account).
Combining much of the above, here is a more pythonic solution:
import urllib2
import shutil
import urlparse
import os

def download(url, fileName=None):
    def getFileName(url, openUrl):
        if 'Content-Disposition' in openUrl.info():
            # If the response has Content-Disposition, try to get the filename from it
            cd = dict(map(
                lambda x: x.strip().split('=') if '=' in x else (x.strip(), ''),
                openUrl.info()['Content-Disposition'].split(';')))
            if 'filename' in cd:
                filename = cd['filename'].strip("\"'")
                if filename:
                    return filename
        # if no filename was found above, parse it out of the final URL.
        return os.path.basename(urlparse.urlsplit(openUrl.url)[2])

    r = urllib2.urlopen(urllib2.Request(url))
    try:
        fileName = fileName or getFileName(url, r)
        with open(fileName, 'wb') as f:
            shutil.copyfileobj(r, f)
    finally:
        r.close()
To kender: this check

if localName[0] == '"' or localName[0] == "'":
    localName = localName[1:-1]

is not safe: a web server can send a badly formatted name such as ["file.ext] or [file.ext'], or even an empty string, in which case localName[0] raises an exception. More robust code would look like this:

localName = localName.replace('"', '').replace("'", "")
if localName == '':
    localName = SOME_DEFAULT_FILE_NAME
Using wget:
custom_file_name = "/custom/path/custom_name.ext"
wget.download(url, custom_file_name)
Using urlretrieve:
urllib.urlretrieve(url, custom_file_name)
Note that urlretrieve does not create intermediate directories; make sure the output directory exists before calling it, as in the sketch below.
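If the destination directory may not exist yet, a common pattern is to create it first (a sketch; url stands for the URL variable used above):

import os
import urllib.request

custom_file_name = "/custom/path/custom_name.ext"
# Create the directory tree before urlretrieve tries to open the file
os.makedirs(os.path.dirname(custom_file_name), exist_ok=True)
urllib.request.urlretrieve(url, custom_file_name)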
You need to look into the 'Content-Disposition' header; see the solution by kender to "How to download a file using python in a 'smarter' way?". Posting his solution, modified with the capability to specify an output folder:
from os.path import basename
import os
from urllib.parse import urlsplit
import urllib.request

def url2name(url):
    return basename(urlsplit(url)[2])

def download(url, out_path):
    localName = url2name(url)
    req = urllib.request.Request(url)
    r = urllib.request.urlopen(req)
    if 'Content-Disposition' in r.info():  # has_key() is gone in Python 3; use the in operator
        # If the response has Content-Disposition, we take the file name from it
        localName = r.info()['Content-Disposition'].split('filename=')[1]
        if localName[0] == '"' or localName[0] == "'":
            localName = localName[1:-1]
    elif r.url != url:
        # if we were redirected, take the real file name from the final URL
        localName = url2name(r.url)
    localName = os.path.join(out_path, localName)
    f = open(localName, 'wb')
    f.write(r.read())
    f.close()

download("https://example.com/demofile", '/home/username/tmp')
I have just updated kender's answer for Python 3.
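For completeness, a similar routine built on the requests library (my own sketch, not part of the original answers; the regex-based filename extraction carries the same quoting caveats discussed above):

import os
import re
import requests
from urllib.parse import urlsplit

def download(url, out_path):
    r = requests.get(url)  # requests follows redirects by default
    r.raise_for_status()
    cd = r.headers.get('Content-Disposition', '')
    match = re.search(r'filename="?([^";]+)"?', cd)
    if match:
        localName = match.group(1)
    else:
        # No usable header: fall back to the last path segment of the final URL
        localName = os.path.basename(urlsplit(r.url).path)
    with open(os.path.join(out_path, localName), 'wb') as f:
        f.write(r.content)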