How to select and download only specific PDFs from a website? - python

I found some code online that downloads all the PDFs found at a URL, and it works, but it fails on the website I need it for. I'm trying to download the PDF of the menu for each day of the week, and I can't seem to figure out how to narrow it down to only those 7 PDF files.
from bs4 import BeautifulSoup
import requests

url = "https://calbaptist.edu/dining/alumni-dining-commons"

# Requests URL and get response object
response = requests.get(url)

# Parse text obtained
soup = BeautifulSoup(response.text, 'html.parser')

# Find all hyperlinks present on webpage
links = soup.find_all('a')

i = 0

# From all links check for pdf link and
# if present download file
for link in links:
    if ".pdf" in link.get('href', []):
        i += 1
        print("Downloading file: ", i)

        # Get response object for link
        response = requests.get(link.get('href'))

        # Write content in pdf file
        pdf = open("pdf"+str(i)+".pdf", 'wb')
        pdf.write(response.content)
        pdf.close()
        print("File ", i, " downloaded")

print("All PDF files downloaded")
I tried to change the if-statement to look for /dining/menus-and-hours/adc-menus/ instead of .pdf. This gave me an error on the line that gets the response object for the link.

Check the href values: they are relative, not absolute, so you have to prepend the "base url".
You could also select your elements more specifically with a CSS selector, for example one that matches where the href contains a substring:
soup.select('a[href*="/dining/menus-and-hours/adc-menus/"]')
or ends with .pdf
soup.select('a[href$=".pdf"]')
You may also want to take a look at enumerate():
for i,e in enumerate(soup.select('a[href*="/dining/menus-and-hours/adc-menus/"]'),start=1):
Checking the content type of the response header:
requests.get('https://calbaptist.edu'+e.get('href')).headers['Content-Type']
Example
from bs4 import BeautifulSoup
import requests

url = "https://calbaptist.edu/dining/alumni-dining-commons"
soup = BeautifulSoup(requests.get(url).text)

for i, e in enumerate(soup.select('a[href*="/dining/menus-and-hours/adc-menus/"]'), start=1):
    r = requests.get('https://calbaptist.edu' + e.get('href'))
    if r.headers['Content-Type'] == 'application/pdf':
        pdf = open("pdf"+str(i)+".pdf", 'wb')
        pdf.write(r.content)
        pdf.close()
        print("File ", i, " downloaded")

Related

How to download all pdf files from multiple urls python

Using Python, I'd like to download all PDF files (except those whose names begin with "INS") from a website.
url_asn="https://www.asn.fr/recherche?filter_year[from]={}&filter_year[to]={}&limit=50&search_content_type=&search_text={}&sort_type=date&page={}"
If link['href'] is not a PDF, then open it and download the PDF files if they exist; for each page, iterate to the last page.
Probably this will work? I have added comments for every line.
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = " "  # url to scrape

# If there is no such folder, the script will create one automatically
folder_location = r'/webscraping'  # folder location

# create folder if it doesn't exist
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)  # get the html
soup = BeautifulSoup(response.text, "html.parser")  # parse the html

for link in soup.select("a[href$='.pdf']"):  # select all the pdf links
    # Name the pdf files using the last portion of each link, which is unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])  # join the folder location and the filename
    with open(filename, 'wb') as f:
        # open the file and write the pdf
        f.write(requests.get(urljoin(url, link['href'])).content)
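The snippet above downloads every PDF on a single page; to match the rest of the original question (skip names beginning with "INS" and walk the paginated ASN search results), a sketch along these lines could work. The year values and page range are illustrative assumptions, not taken from the site:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url_asn = "https://www.asn.fr/recherche?filter_year[from]={}&filter_year[to]={}&limit=50&search_content_type=&search_text={}&sort_type=date&page={}"

for page in range(1, 5):  # illustrative range; in practice iterate until the last page
    page_url = url_asn.format(2020, 2021, "", page)
    soup = BeautifulSoup(requests.get(page_url).text, "html.parser")
    for link in soup.select("a[href$='.pdf']"):  # only anchors whose href ends in .pdf
        filename = link['href'].split('/')[-1]
        if filename.startswith("INS"):  # skip names that begin with "INS"
            continue
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(page_url, link['href'])).content)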

How to download an HTML file completely? [duplicate]

Currently I have a script that can only download the HTML of a given page.
Now I want to download all the files of the web page including HTML, CSS, JS and image files (same as we get with a ctrl-s of any website).
My current code is:
import urllib
url = "https://en.wikipedia.org/wiki/Python_%28programming_language%29"
urllib.urlretrieve(url, "t3.html")
I visited many questions but they are all only downloading the HTML.
The following implementation enables you to get the sub-HTML websites. It can be developed further to get the other files you need. I set the depth variable so you can specify the maximum number of sub-website levels you want to parse.
import urllib2
from BeautifulSoup import *
from urlparse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=2)
print urls
Python 3 version, 2019. May this save somebody some time:
#!/usr/bin/env python
import urllib.request as urllib2
from bs4 import *
from urllib.parse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
print(urls)
You can easily do that with the simple Python library pywebcopy.
For current version 5.0.1:
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS, and JS all in your download_folder, working completely like the original site.
Using Python 3+, Requests, and other standard libraries.
The function savePage receives a requests.Response and the pagefilename where to save it.
It saves pagefilename.html in the current folder.
It downloads JavaScript, CSS, and images based on the tags script, link, and img, and saves them in a folder pagefilename_files.
Any exceptions are printed to sys.stderr; it returns a BeautifulSoup object.
The requests session must be a global variable unless someone writes cleaner code here for us.
You can adapt it to your needs.
import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def soupfindAllnSave(pagefolder, url, soup, tag2find='img', inner='src'):
    if not os.path.exists(pagefolder):  # create only once
        os.mkdir(pagefolder)
    for res in soup.findAll(tag2find):  # images, css, etc..
        try:
            filename = os.path.basename(res[inner])
            fileurl = urljoin(url, res.get(inner))
            # rename to saved file path
            # res[inner] # may or may not exist
            filepath = os.path.join(pagefolder, filename)
            res[inner] = os.path.join(os.path.basename(pagefolder), filename)
            if not os.path.isfile(filepath):  # was not downloaded
                with open(filepath, 'wb') as file:
                    filebin = session.get(fileurl)
                    file.write(filebin.content)
        except Exception as exc:
            print(exc, file=sys.stderr)
    return soup

def savePage(response, pagefilename='page'):
    url = response.url
    soup = BeautifulSoup(response.text)
    pagefolder = pagefilename + '_files'  # page contents
    soup = soupfindAllnSave(pagefolder, url, soup, 'img', inner='src')
    soup = soupfindAllnSave(pagefolder, url, soup, 'link', inner='href')
    soup = soupfindAllnSave(pagefolder, url, soup, 'script', inner='src')
    with open(pagefilename + '.html', 'w') as file:
        file.write(soup.prettify())
    return soup
Example: saving the Google page and its contents (google_files folder)
session = requests.Session()
#... whatever requests config you need here
response = session.get('https://www.google.com')
savePage(response, 'google')
Try the Python library Scrapy. You can program Scrapy to recursively scan a website by downloading its pages, scanning them, and following links:
An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.
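As a rough sketch of what such a spider might look like (the spider name, start URL, and selectors are placeholders, not from the original post; run it with scrapy runspider):
import scrapy

class PdfSpider(scrapy.Spider):
    name = "pdf_spider"  # hypothetical name
    start_urls = ["https://example.com/"]  # placeholder start page

    def parse(self, response):
        # download every PDF linked from this page
        for href in response.css('a[href$=".pdf"]::attr(href)').getall():
            yield scrapy.Request(response.urljoin(href), callback=self.save_pdf)
        # follow other links recursively; Scrapy de-duplicates requests by default
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)

    def save_pdf(self, response):
        filename = response.url.split("/")[-1]  # save under the original file name
        with open(filename, "wb") as f:
            f.write(response.body)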

Crawling csv files from a url with dropdown list?

I am trying to crawl monthly data (CSV files) from Weather Canada.
Normally one needs to select the year/month/day from the dropdown lists, click "GO", and then click the "Download Data" button to get the data for the selected month and year, as below.
I'd like to download all data files in CSV from all available months/years in Python (with BeautifulSoup 4).
I tried to modify some code from another question here, but haven't been successful. Please help.
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve

# Removed the trailing / from the URL
urlJan2020 = '''https://climate.weather.gc.ca/climate_data/hourly_data_e.html?hlyRange=2004-09-24%7C2020-03-03&dlyRange=2018-05-14%7C2020-03-03&mlyRange=%7C&StationID=43403&Prov=NS&urlExtension=_e.html&searchType=stnProx&optLimit=yearRange&StartYear=1840&EndYear=2020&selRowPerPage=25&Line=0&txtRadius=50&optProxType=city&selCity=44%7C40%7C63%7C36%7CHalifax&selPark=&txtCentralLatDeg=&txtCentralLatMin=0&txtCentralLatSec=0&txtCentralLongDeg=&txtCentralLongMin=0&txtCentralLongSec=0&txtLatDecDeg=&txtLongDecDeg=&timeframe=1&Year=2020&Month=1&Day=1#'''

u = urlopen(urlJan2020)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")

# Select all A elements that have an href attribute, starting with http://
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.csv', '.xls', '.xlsx']):
        continue
    filename = href.rsplit('/', 1)[-1]
    # You don't need to join + quote as URLs in the HTML are absolute.
    # However, we need a https:// URL (in spite of what the link says:
    # check the request in your web browser's developer tools)
    href = href.replace('http://', 'https://')
    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)

print("Done.")
from bs4 import BeautifulSoup
import requests

def Main():
    with requests.Session() as req:
        for year in range(2019, 2021):
            for month in range(1, 13):
                r = req.post(
                    f"https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID=43403&Year={year}&Month={month}&Day=1&timeframe=1&submit=Download+Data")
                name = r.headers.get(
                    "Content-Disposition").split("_", 5)[-1][:-1]
                with open(name, 'w') as f:
                    f.write(r.text)
                print(f"Saved {name}")

Main()

Issue downloading multiple PDFs

After running the following code, I am unable to open the downloaded PDFs. Even though the code ran successfully, the downloaded PDF files are damaged.
My computer's error message is
Unable to open file. it may be damaged or in a format Preview doesn't recognize.
Why are they damaged and how do I solve this?
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://github.com/sonhuytran/MIT8.01SC.2010F/tree/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual"

# If there is no such folder, the script will create one automatically
folder_location = r'/Users/rahelmizrahi/Desktop/ Physics_Solutions'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

for link in soup.select("a[href$='.pdf']"):
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
The issue is that you are requesting the link within the GitHub 'blob' path when you need the 'raw' link:
'/sonhuytran/MIT8.01SC.2010F/blob/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf'
but you want:
'/sonhuytran/MIT8.01SC.2010F/raw/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf'
So just adjust that. Full code below:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://github.com/sonhuytran/MIT8.01SC.2010F/tree/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual"

# If there is no such folder, the script will create one automatically
folder_location = r'/Users/rahelmizrahi/Desktop/Physics_Solutions'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

for link in soup.select("a[href$='.pdf']"):
    pdf_link = link['href'].replace('blob', 'raw')
    pdf_file = requests.get('https://github.com' + pdf_link)
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(pdf_file.content)
I had to use soup.select("a[href$=.pdf]") (without the inner quotes) to get it to select the links correctly.
After that, your script works, but: what you're downloading is not a PDF, but an HTML webpage! Try visiting one of the URLs: https://github.com/sonhuytran/MIT8.01SC.2010F/blob/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf
You'll be presented with a GitHub webpage, not the actual PDF. To get that, you need the "raw" GitHub URL, which you can see when you hover over the Download button: https://github.com/sonhuytran/MIT8.01SC.2010F/raw/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf
So, it looks like you just have to replace blob with raw at the proper spot to make it work:
href = link['href']
href = href.replace('/blob/', '/raw/')
requests.get(urljoin(url, href)).content
The issue is that the file is not properly closed after the open/write.
Just add f.close() at the end of the code to do that.

Web scraping / web crawling a PDF document with a URL that changes on the website, with Python

import os
import requests
from bs4 import BeautifulSoup

desktop = os.path.expanduser("~/Desktop")
url = 'https://www.ici.org/research/stats'

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
excel_files = soup.select('a[href*=xls]')

for each in excel_files:
    if 'Supplement: Worldwide Public Tables' in each.text:
        link = 'https://www.ici.org' + each['href']
        filename = each['href'].split('/')[-1]
        if os.path.isfile(desktop + '/' + filename):
            print('*** File already exists: %s ***' % filename)
            continue
        resp = requests.get(link)
        output = open(desktop + '/' + filename, 'wb')
        output.write(resp.content)
        output.close()
        print('Saved: %s' % filename)
I am new to web scraping, and I want to automatically download a PDF document from a list of websites.
This document is updated on a monthly basis and the url changes on the website.
e.g https://fundcentres.lgim.com/fund-centre/OEIC/Sterling-Liquidity-Fund
I want to download the 'factsheet' pdf document from the above website.
I think the ideal way would be for the code to press the 'Factsheet' button and save the file to a location on the drive. The difficulty is that the URL changes!
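Since no answer is included here, a minimal sketch of one approach: re-scrape the fund page each time and follow whatever link is currently labelled "Factsheet" instead of hard-coding the PDF URL. The link-text match and the Content-Type check are assumptions about the page; if the factsheet link is injected by JavaScript, a browser-automation tool such as Selenium would be needed instead of requests:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

fund_url = "https://fundcentres.lgim.com/fund-centre/OEIC/Sterling-Liquidity-Fund"
soup = BeautifulSoup(requests.get(fund_url).text, "html.parser")

for a in soup.find_all("a", href=True):
    # assumption: the stable thing to match on is the link text, not the changing URL
    if "factsheet" in a.get_text(strip=True).lower():
        pdf_url = urljoin(fund_url, a["href"])  # resolve relative links
        r = requests.get(pdf_url)
        if r.headers.get("Content-Type", "").startswith("application/pdf"):
            with open("factsheet.pdf", "wb") as f:
                f.write(r.content)
        break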
