Download all PDF files from a website using Python

I have followed several online guides in an attempt to build a script that can identify and download all pdfs from a website to save me from doing it manually. Here is my code so far:
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib
# connect to website and get list of all pdfs
url="http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))
# clean the pdf link names
url_list = []
for el in links:
    url_list.append(("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href']))
#print(url_list)
# download the pdfs to a specified location
for url in url_list:
    print(url)
    fullfilename = os.path.join('E:\webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", "").replace(".pdf",""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)
The code appears to find all the PDFs (uncomment print(url_list) to see this). However, it fails at the download stage. In particular, I get this error and I am not able to understand what's gone wrong:
E:\webscraping>python get_pdfs.py
http://www.gatsby.ucl.ac.uk/teaching/courses/http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/cribsheet.pdf
E:\webscraping\http://www.gatsby.ucl.ac.uk/teaching/courses/cribsheet
Traceback (most recent call last):
File "get_pdfs.py", line 26, in <module>
request.urlretrieve(url, fullfilename)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Can somebody help me please?

Check out the following implementation. I've used the requests module instead of urllib to do the download. Moreover, I've used the .select() method instead of .find_all() to avoid using re.
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
#If there is no such folder, the script will create one automatically
folder_location = r'E:\webscraping'
if not os.path.exists(folder_location):os.mkdir(folder_location)
response = requests.get(url)
soup= BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    #Name the pdf files using the last portion of each link which are unique in this case
    filename = os.path.join(folder_location,link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url,link['href'])).content)

A couple of links already contained the server address, which caused the 404 Not Found errors. Also, you should not remove .pdf from the filename, as that will save the file without an extension.
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib
# connect to website and get list of all pdfs
url="http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))
# clean the pdf link names
url_list = []
for el in links:
    if(el['href'].startswith('http')):
        url_list.append(el['href'])
    else:
        url_list.append("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href'])
print(url_list)
# download the pdfs to a specified location
for url in url_list:
    print(url)
    fullfilename = os.path.join('E:\webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", ""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)

Generally, the answers above should work. However, you should examine the HTML source of the web page you're trying to work with. For example, some pages might have the og_url property in a meta tag while others may not. This can happen if you're working with a secure website (say, your university's course web page). In that case, you will have to extract the PDF links differently.
You can find a good explanation and solution here:
https://medium.com/@dementorwriter/notesdownloader-use-web-scraping-to-download-all-pdfs-with-python-511ea9f55e48
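For instance, here is a minimal sketch (the course-page URL is a placeholder, not from the thread) that prefers the page's og:url meta property, when present, as the base for resolving relative PDF links:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://your-university.example/course-page.html"  # hypothetical page
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Use the og:url meta property as the base URL if the page provides one,
# otherwise fall back to the request URL itself.
og = soup.find("meta", property="og:url")
base_url = og["content"] if og and og.get("content") else url

pdf_links = [urljoin(base_url, a["href"]) for a in soup.select("a[href$='.pdf']")]
print(pdf_links)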

I wrote a new script based on @SIM's answer, adding argparse. My full code is as follows:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import argparse
#%% Example
# one pdf
# python all_pdf_dl.py -l https://memento.epfl.ch/academic-calendar/ --save-here
# many pdfs
# python all_pdf_dl.py -l https://idsc.ethz.ch/education/lectures/recursive-estimation.html
#%% Functions
def all_pdf_download(args):
    base_url = args.link
    if args.save_here:
        folder_path = os.getcwd()
    else:
        folder_path = args.folder_path
        if not os.path.exists(args.folder_path): os.mkdir(args.folder_path)
    print("====== 1. Set savepath: {} ======".format(folder_path))
    print("====== 2. Start searching ======")
    #response = requests.get(base_url)
    response = requests.get(base_url, headers={'User-Agent': 'Custom'})
    soup = BeautifulSoup(response.text, "html.parser")
    search_res = soup.select("a[href$='.pdf']")
    print("{} files found!!!".format(len(search_res)))
    print("====== 3. Start downloading ======")
    for counter, link in enumerate(search_res):
        #Name the pdf files using the last portion of each link which are unique in this case
        filename = link['href'].split('/')[-1]
        file_save_path = os.path.join(folder_path, link['href'].split('/')[-1])
        if args.print_all:
            print("[{}/{}] {}".format(counter+1, len(search_res), filename))
        with open(file_save_path, 'wb') as f:
            f.write(requests.get(urljoin(base_url, link['href'])).content)
    print("====== 4. Finished!!! ======")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Test argparse')
    ####################################
    ############ ALL OPTION ############
    ## Main option
    # -l/--link
    parser.add_argument('-l', '--link', required=True, type=str,
                        help='write down site name')
    # --print-all
    parser.add_argument('--print-all', dest='print_all', action='store_true',
                        help="print all filename")
    parser.set_defaults(print_all=True)
    # --save-here
    parser.add_argument('--save-here', dest='save_here', action='store_true',
                        help="save files here")
    parser.set_defaults(save_here=False)
    # --save--folder
    # default setting -> Downloads/ in user’s home directory obtained by (os.path.expanduser('~'))
    parser.add_argument('-f', '--folder_path', default=r""+os.path.join(os.path.expanduser('~'), "Downloads"),
                        type=str, help='save files in the given folder')
    ########################################
    ############ PARSING OPTION ############
    args = parser.parse_args()
    all_pdf_download(args)
For more details and updates, you can refer to my gist: hibetterheyj/all_pdf_dl.py
Best!

Variations on @SIM's answer for my needs:
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib
# connect to website and get list of all pdfs
url="http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=Compilers&doc=docs/slides.html"
pdfPath = "http://openclassroom.stanford.edu/MainFolder/courses/Compilers/docs/"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))
# clean the pdf link names
url_list = []
for el in links:
    if(el['href'].startswith('http')):
        url_list.append(el['href'])
    else:
        url_list.append(pdfPath + el['href'])
print(f'url_list: {url_list}\n')
# download the pdfs to a specified location
for url in url_list:
    print(f'urL: {url}\n')
    fullfilename = os.path.join(r'standfordPdfs', url.replace(pdfPath, ""))
    print(f'fullfilename: {fullfilename}')
    request.urlretrieve(url, fullfilename)

Related

Downloading pdf from URL with urllib (getting weird html output instead)

I am trying to download pdfs from several pdf urls.
An example: https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf
This URL opens directly as a PDF in my browser.
However, when I use this code to download it using the link, it returns an HTML file instead.
link = "https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf"
urllib.request.urlretrieve(link, f"/content/drive/MyDrive/Research/pdfs/1.pdf")
The resulting "pdf" file or HTML code file is downloaded instead:
How do I solve this issue? Appreciate any help, thanks!
You can use BeautifulSoup or lxml to find the <iframe> and get its src, and then use that to download the file:
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as BS
url = 'https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf'
response = urllib.request.urlopen(url)
soup = BS(response.read(), 'html.parser')
iframe = soup.find('iframe')
url = iframe['src']
filename = urllib.parse.unquote(url)
filename = filename.rsplit('/', 1)[-1]
urllib.request.urlretrieve(url, filename)
Alternatively, you can check a few files to see if they all use the same https://d2x0djib3vzbzj.cloudfront.net/ prefix and simply replace it in the URL:
import urllib.request
import urllib.parse
url = 'https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf'
url = url.replace('https://www.fasb.org/page/showpdf?path=',
'https://d2x0djib3vzbzj.cloudfront.net/')
filename = urllib.parse.unquote(url)
filename = filename.rsplit('/', 1)[-1]
urllib.request.urlretrieve(url, filename)

How to download an HTML file completely? [duplicate]

Currently I have a script that can only download the HTML of a given page.
Now I want to download all the files of the web page including HTML, CSS, JS and image files (same as we get with a ctrl-s of any website).
My current code is:
import urllib
url = "https://en.wikipedia.org/wiki/Python_%28programming_language%29"
urllib.urlretrieve(url, "t3.html")
I have looked at many questions, but they all only download the HTML.
The following implementation enables you to get the sub-HTML pages. It can be developed further to get the other files you need. I set the depth variable to let you specify the maximum number of sub-page levels that you want to parse.
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
def crawl(pages, depth=None):
    indexed_url = [] # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a') #finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url
pagelist=["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=2)
print urls
Python3 version, 2019. May this saves some time to somebody:
#!/usr/bin/env python
import urllib.request as urllib2
from bs4 import *
from urllib.parse import urljoin
def crawl(pages, depth=None):
    indexed_url = [] # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print( "Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a') #finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url
pagelist=["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
print( urls )
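Note that both versions above only collect URLs; they do not save anything to disk. A rough sketch (not part of the original answer) that downloads each collected URL, reusing the urllib2 alias defined above:
import os
from urllib.parse import urlparse

for u in urls:
    # Derive a filename from the URL path; fall back for bare domain URLs.
    name = os.path.basename(urlparse(u).path) or 'index.html'
    try:
        with urllib2.urlopen(u) as resp, open(name, 'wb') as f:
            f.write(resp.read())
    except Exception as exc:
        print("Could not download %s: %s" % (u, exc))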
You can easily do that with the simple Python library pywebcopy.
For the current version (5.0.1):
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS, and JS all in your download_folder, working just like the original site.
Using Python 3+, Requests, and other standard libraries.
The function savePage receives a requests.Response and the pagefilename where to save it. It saves pagefilename.html in the current folder, downloads the JavaScript, CSS, and images referenced by the script, link, and img tags into a folder pagefilename_files, prints any exceptions to sys.stderr, and returns a BeautifulSoup object.
The Requests session must be a global variable unless someone writes cleaner code here for us.
You can adapt it to your needs.
import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def soupfindAllnSave(pagefolder, url, soup, tag2find='img', inner='src'):
    if not os.path.exists(pagefolder): # create only once
        os.mkdir(pagefolder)
    for res in soup.findAll(tag2find): # images, css, etc..
        try:
            filename = os.path.basename(res[inner])
            fileurl = urljoin(url, res.get(inner))
            # rename to saved file path
            # res[inner] # may or may not exist
            filepath = os.path.join(pagefolder, filename)
            res[inner] = os.path.join(os.path.basename(pagefolder), filename)
            if not os.path.isfile(filepath): # was not downloaded
                with open(filepath, 'wb') as file:
                    filebin = session.get(fileurl)
                    file.write(filebin.content)
        except Exception as exc:
            print(exc, file=sys.stderr)
    return soup

def savePage(response, pagefilename='page'):
    url = response.url
    soup = BeautifulSoup(response.text)
    pagefolder = pagefilename+'_files' # page contents
    soup = soupfindAllnSave(pagefolder, url, soup, 'img', inner='src')
    soup = soupfindAllnSave(pagefolder, url, soup, 'link', inner='href')
    soup = soupfindAllnSave(pagefolder, url, soup, 'script', inner='src')
    with open(pagefilename+'.html', 'w') as file:
        file.write(soup.prettify())
    return soup
Example: saving the Google page and its contents (google_files folder):
session = requests.Session()
#... whatever requests config you need here
response = session.get('https://www.google.com')
savePage(response, 'google')
Try the Python library Scrapy. You can program Scrapy to recursively scan a website by downloading its pages, scanning, following links:
An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.
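A minimal sketch of such a spider (not from the quote above; the domain and the save-to-disk handling are placeholders) that follows links within one site and saves any URL ending in .pdf:
import scrapy

class PdfSpider(scrapy.Spider):
    name = "pdf_spider"
    allowed_domains = ["example.com"]    # placeholder domain
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # If the response is a PDF, save it; otherwise follow the links on the page.
        if response.url.lower().endswith(".pdf"):
            with open(response.url.split("/")[-1], "wb") as f:
                f.write(response.body)
            return
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)
You would run it with scrapy runspider pdf_spider.py, or with scrapy crawl pdf_spider from inside a Scrapy project.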

retrieve specific links from web page using python and BeautifulSoup

I have been trying to retrieve an href link from a page and use it as a variable for the next href link. But I am stuck at a point where I have multiple href links with different file extensions (like zip, md5, etc.) and only need the file with the zip extension. Here is the code I am trying to implement:
import httplib2
from BeautifulSoup import BeautifulSoup, SoupStrainer
http = httplib2.Http()
status, response = http.request('http://example.com')
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
    if link.has_key('href'):
        if '/abc' in link['href']:
            basename = link['href'].split("/")[11]
            print basename
status, response = http.request('http://example.com/%basename',basename)
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
    if link.has_key('href'):
        if '/abc' in link['href']:
            basename = link['href'].split("/")[11]
            print basename
Try this:
import os
# YOUR CODE here
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
    if link.has_key('href'):
        if '/abc' in link['href']:
            basename = link['href'].split("/")[11]
            # check file extension (note: splitext keeps the leading dot)
            filename, file_extension = os.path.splitext(basename)
            print basename, file_extension
            if file_extension.lower() != '.zip':
                continue
            # YOUR LAST CODE

Download .xls files from a webpage using Python and BeautifulSoup

I want to download all the .xls or .xlsx or .csv from this website into a specified folder.
https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009
I have looked into mechanize, Beautiful Soup, urllib2, etc. Mechanize does not work in Python 3, and urllib2 also had problems with Python 3; I looked for workarounds but couldn't find one. So I am currently trying to make it work using Beautiful Soup.
I found some example code and attempted to modify it to suit my problem, as follows:
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin
url = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009/'
u = urlopen(url)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()
soup = BeautifulSoup(html)
for link in soup.select('div[webpartid] a'):
    href = link.get('href')
    if href.startswith('javascript:'):
        continue
    filename = href.rsplit('/', 1)[-1]
    href = urljoin(url, quote(href))
    try:
        urlretrieve(href, filename)
    except:
        print('failed to download')
However, when run, this code does not extract the files from the target page, nor does it output any failure message (e.g. 'failed to download').
How can I use BeautifulSoup to select the Excel files from the page?
How can I download these files to a local file using Python?
The issues with your script as it stands are:
The url has a trailing / which gives an invalid page when requested, not listing the files you want to download.
The CSS selector in soup.select(...) is selecting div with the attribute webpartid which does not exist anywhere in that linked document.
You are joining the URL and quoting it, even though the links are given in the page as absolute URLs and they do not need quoting.
The try:...except: block is stopping you from seeing the errors generated when trying to download the file. Using an except block without a specific exception is bad practice and should be avoided.
A modified version of your code that will get the correct files and attempt to download them is as follows:
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin
# Remove the trailing / you had, as that gives a 404 page
url = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'
u = urlopen(url)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()
soup = BeautifulSoup(html, "html.parser")
# Select all A elements with href attributes containing URLs starting with http://
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    # Make sure it has one of the correct extensions
    if not any(href.endswith(x) for x in ['.csv','.xls','.xlsx']):
        continue
    filename = href.rsplit('/', 1)[-1]
    print("Downloading %s to %s..." % (href, filename) )
    urlretrieve(href, filename)
    print("Done.")
However, if you run this you'll notice that a urllib.error.HTTPError: HTTP Error 403: Forbidden exception is thrown, even though the file is downloadable in the browser.
At first I thought this was a referrer check (to prevent hotlinking); however, if you watch the request in your browser (e.g. Chrome Developer Tools) you'll notice that
the initial http:// request is blocked there also, and then Chrome attempts an https:// request for the same file.
In other words, the request must go via HTTPS to work (despite what the URLs in the page say). To fix this you will need to rewrite http: to https: before using the URL for the request. The following code will correctly modify the URLs and download the files. I've also added a variable to specify the output folder, which is added to the filename using os.path.join:
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve
URL = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'
OUTPUT_DIR = '' # path to output folder, '.' or '' uses current folder
u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()
soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.csv','.xls','.xlsx']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    # We need a https:// URL for this site
    href = href.replace('http://','https://')
    print("Downloading %s to %s..." % (href, filename) )
    urlretrieve(href, filename)
    print("Done.")
I found this to be a good working example, using the BeautifulSoup4, requests, and wget modules for Python 2.7:
import requests
import wget
import os
from bs4 import BeautifulSoup, SoupStrainer
url = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'
file_types = ['.xls', '.xlsx', '.csv']
for file_type in file_types:
    response = requests.get(url)
    for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            if file_type in link['href']:
                full_path = url + link['href']
                wget.download(full_path)
I tried the above code and it is still giving me urllib.error.HTTPError: HTTP Error 403: Forbidden.
I also tried adding user agents; my modified code is:
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import Request,urlopen, urlretrieve
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
URL = Request('https://www.rbi.org.in/scripts/bs_viewcontent.aspx?Id=2009', headers=headers)
#URL = 'https://www.rbi.org.in/scripts/bs_viewcontent.aspx?Id=2009'
OUTPUT_DIR = 'E:\python\out' # path to output folder, '.' or '' uses current folder
u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()
soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.csv','.xls','.xlsx']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    # We need a https:// URL for this site
    href = href.replace('http://','https://')
    print("Downloading %s to %s..." % (href, filename) )
    urlretrieve(href, filename)
    print("Done.")
This worked best for me ... using Python 3:
import os
import urllib
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve
from urllib.error import HTTPError
URL = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'
OUTPUT_DIR = '' # path to output folder, '.' or '' uses current folder
u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()
soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.csv','.xls','.xlsx']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    # We need a https:// URL for this site
    href = href.replace('http://','https://')
    try:
        print("Downloading %s to %s..." % (href, filename) )
        urlretrieve(href, filename)
        print("Done.")
    except urllib.error.HTTPError as err:
        if err.code == 404:
            continue

How can I download all types of files in Python with the requests library

I am building a crawler in Python and I have a list of hrefs from the page.
Now I have a list of file extensions to download, like:
list = ['zip','rar','pdf','mp3']
How can I save the files from those URLs to a local directory using Python?
EDIT:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.example.com/downlaod"
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
list_urls = soup.find_all('a')
print list_urls[6]
Going by your posted example:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.example.com/downlaod"
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
list_urls = soup.find_all('a')
print list_urls[6]
So, the URL you want to fetch next is presumably list_urls[6]['href'].
The first trick is that this might be a relative URL rather than absolute. So:
newurl = list_urls[6]['href']
absurl = urlparse.urljoin(site.url, newurl)
Also, you want to only fetch the file if it has the right extension, so:
if not absurl.endswith(extensions):
    return # or break or whatever
But once you've decided what URL you want to download, it's no harder than your initial fetch:
page = urllib2.urlopen(absurl)
html = page.read()
path = urlparse.urlparse(absurl).path
name = os.path.basename(path)
with open(name, 'wb') as f:
    f.write(html)
That's mostly it.
There are a few things you might want to add, but if so, you have to add them all manually. For example:
Look for a Content-disposition header with a suggested filename to use in place of the URL's basename.
copyfile from page to f instead of reading the whole thing into memory and then writing it out.
Deal with existing files with the same name.
…
But that's the basics; a rough sketch of the first two points is below.
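A sketch of those two points, kept in the same Python 2 style as the answer (the filename regex is a simplification, not from the original):
import os
import re
import shutil
import urllib2
import urlparse

def save_url(absurl):
    page = urllib2.urlopen(absurl)
    # Prefer a filename suggested by the Content-Disposition header, if any.
    cd = page.info().getheader('Content-Disposition') or ''
    match = re.search(r'filename="?([^";]+)"?', cd)
    name = match.group(1) if match else os.path.basename(urlparse.urlparse(absurl).path)
    with open(name, 'wb') as f:
        shutil.copyfileobj(page, f)  # stream to disk instead of read()-ing everything into memory
    return name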
You can use the Python requests library, as you asked in the question: http://www.python-requests.org
You can save a file from a URL like this:
import requests
url='http://i.stack.imgur.com/0LJdh.jpg'
data=requests.get(url).content
filename="image.jpg"
with open(filename, 'wb') as f:
    f.write(data)
A solution using urllib3:
import os
import urllib3
from bs4 import BeautifulSoup
import urllib.parse
url = "https://path/site"
site = urllib3.PoolManager()
html = site.request('GET', url)
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('a')
And then a recursive function to get all the files:
def recursive_function(list_urls):
    if not list_urls:  # base case: stop when there are no more links
        return
    newurl = list_urls[0]['href']
    absurl = url + newurl
    list_urls.pop(0)
    if absurl.endswith(extensions): # verify it has one of the targeted extensions
        page = urllib3.PoolManager()
        html = page.request('GET', absurl)
        name = os.path.basename(absurl)
        with open(name, 'wb') as f:
            f.write(html.data)
    return recursive_function(list_urls)
