Crawling CSV files from a URL with a dropdown list? - Python

I am trying to crawl monthly data (CSV files) from Weather Canada.
Normally one needs to select the year/month/day from the dropdown lists, click "Go", and then click the "Download Data" button to get the data for the selected month and year.
I'd like to download all the available monthly data files as CSV in Python (with Beautiful Soup 4).
I tried to modify some code from another question here, but haven't been successful. Please help.
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve

# Removed the trailing / from the URL
urlJan2020 = '''https://climate.weather.gc.ca/climate_data/hourly_data_e.html?hlyRange=2004-09-24%7C2020-03-03&dlyRange=2018-05-14%7C2020-03-03&mlyRange=%7C&StationID=43403&Prov=NS&urlExtension=_e.html&searchType=stnProx&optLimit=yearRange&StartYear=1840&EndYear=2020&selRowPerPage=25&Line=0&txtRadius=50&optProxType=city&selCity=44%7C40%7C63%7C36%7CHalifax&selPark=&txtCentralLatDeg=&txtCentralLatMin=0&txtCentralLatSec=0&txtCentralLongDeg=&txtCentralLongMin=0&txtCentralLongSec=0&txtLatDecDeg=&txtLongDecDeg=&timeframe=1&Year=2020&Month=1&Day=1#'''

u = urlopen(urlJan2020)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")

# Select all A elements that have an href attribute, starting with http://
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.csv', '.xls', '.xlsx']):
        continue
    filename = href.rsplit('/', 1)[-1]
    # You don't need to join + quote as URLs in the HTML are absolute.
    # However, we need a https:// URL (in spite of what the link says:
    # check the request in your web browser's developer tools)
    href = href.replace('http://', 'https://')
    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)
    print("Done.")

from bs4 import BeautifulSoup
import requests

def Main():
    with requests.Session() as req:
        for year in range(2019, 2021):
            for month in range(1, 13):
                r = req.post(
                    f"https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID=43403&Year={year}&Month={month}&Day=1&timeframe=1&submit=Download+Data")
                name = r.headers.get(
                    "Content-Disposition").split("_", 5)[-1][:-1]
                with open(name, 'w') as f:
                    f.write(r.text)
                print(f"Saved {name}")

Main()
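The station URL in the question advertises hlyRange=2004-09-24|2020-03-03, so (assuming that range is accurate for station 43403) you could widen the loops to cover every available month rather than only 2019-2020. A minimal sketch along the same lines as the answer above, with a fallback filename in case the Content-Disposition header is ever missing:
import requests

BULK_URL = ("https://climate.weather.gc.ca/climate_data/bulk_data_e.html"
            "?format=csv&stationID=43403&Year={year}&Month={month}"
            "&Day=1&timeframe=1&submit=Download+Data")

with requests.Session() as session:
    for year in range(2004, 2021):      # hourly data starts in 2004 per hlyRange
        for month in range(1, 13):
            r = session.post(BULK_URL.format(year=year, month=month))
            # Prefer the server-supplied name, otherwise fall back to year-month
            cd = r.headers.get("Content-Disposition", "")
            name = cd.split("_", 5)[-1].strip('"') if cd else "{}-{:02d}.csv".format(year, month)
            with open(name, "w", encoding="utf-8") as f:
                f.write(r.text)
            print("Saved", name)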

Related

How to select and download only specific PDFs from a website?

I found some code online that allows you to download all the PDFs found at a URL, and it works, but it fails on the website I need it for. I'm trying to download the PDF of the menu for each day of the week, and I can't seem to figure out how to narrow it down to only those 7 PDF files.
from bs4 import BeautifulSoup
import requests

url = "https://calbaptist.edu/dining/alumni-dining-commons"

# Requests URL and get response object
response = requests.get(url)

# Parse text obtained
soup = BeautifulSoup(response.text, 'html.parser')

# Find all hyperlinks present on webpage
links = soup.find_all('a')

i = 0

# From all links check for pdf link and
# if present download file
for link in links:
    if ".pdf" in link.get('href', []):
        i += 1
        print("Downloading file: ", i)

        # Get response object for link
        response = requests.get(link.get('href'))

        # Write content in pdf file
        pdf = open("pdf"+str(i)+".pdf", 'wb')
        pdf.write(response.content)
        pdf.close()
        print("File ", i, " downloaded")

print("All PDF files downloaded")
I tried changing the if-statement to look for /dining/menus-and-hours/adc-menus/ instead of .pdf. This gave me an error on the line that gets the response object for the link.
Check the href values: they are relative, not absolute, so you have to prepend the "base url".
You could also select your elements more specifically with a CSS selector, for example one whose href contains something:
soup.select('a[href*="/dining/menus-and-hours/adc-menus/"]')
or ends with .pdf:
soup.select('a[href$=".pdf"]')
You may also take a look at enumerate():
for i,e in enumerate(soup.select('a[href*="/dining/menus-and-hours/adc-menus/"]'),start=1):
And check the content type of the response header:
requests.get('https://calbaptist.edu'+e.get('href')).headers['Content-Type']
Example
from bs4 import BeautifulSoup
import requests

url = "https://calbaptist.edu/dining/alumni-dining-commons"
soup = BeautifulSoup(requests.get(url).text)

for i,e in enumerate(soup.select('a[href*="/dining/menus-and-hours/adc-menus/"]'),start=1):
    r = requests.get('https://calbaptist.edu'+e.get('href'))
    if r.headers['Content-Type'] == 'application/pdf':
        pdf = open("pdf"+str(i)+".pdf", 'wb')
        pdf.write(r.content)
        pdf.close()
        print("File ", i, " downloaded")

How to download an HTML file completely? [duplicate]

Currently I have a script that can only download the HTML of a given page.
Now I want to download all the files of the web page, including the HTML, CSS, JS and image files (the same as we get with Ctrl-S on any website).
My current code is:
import urllib
url = "https://en.wikipedia.org/wiki/Python_%28programming_language%29"
urllib.urlretrieve(url, "t3.html")
I have visited many questions, but they all only download the HTML.
The following implementation enables you to get the sub-HTML pages of the main website. It can be developed further to get the other files you need. I set the depth variable so you can set the maximum number of sub-websites that you want to parse.
import urllib2
from BeautifulSoup import *
from urlparse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=2)
print urls
Python 3 version, 2019. May this save somebody some time:
#!/usr/bin/env python

import urllib.request as urllib2
from bs4 import *
from urllib.parse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
print(urls)
You can easily do that with the simple Python library pywebcopy.
For the current version (5.0.1):
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS and JS all in your download_folder, working just like the original site.
Using Python 3+, Requests, and other standard libraries.
The function savePage receives a requests.Response and the pagefilename where it should be saved.
It saves pagefilename.html in the current folder.
It downloads JavaScript, CSS and images based on the script, link and img tags and saves them in a folder named pagefilename_files.
Any exceptions are printed on sys.stderr, and a BeautifulSoup object is returned.
The requests session must be a global variable unless someone writes cleaner code here for us.
You can adapt it to your needs.
import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def soupfindAllnSave(pagefolder, url, soup, tag2find='img', inner='src'):
    if not os.path.exists(pagefolder):  # create only once
        os.mkdir(pagefolder)
    for res in soup.findAll(tag2find):  # images, css, etc..
        try:
            filename = os.path.basename(res[inner])
            fileurl = urljoin(url, res.get(inner))
            # rename to saved file path
            # res[inner] # may or may not exist
            filepath = os.path.join(pagefolder, filename)
            res[inner] = os.path.join(os.path.basename(pagefolder), filename)
            if not os.path.isfile(filepath):  # was not downloaded
                with open(filepath, 'wb') as file:
                    filebin = session.get(fileurl)
                    file.write(filebin.content)
        except Exception as exc:
            print(exc, file=sys.stderr)
    return soup

def savePage(response, pagefilename='page'):
    url = response.url
    soup = BeautifulSoup(response.text)
    pagefolder = pagefilename+'_files'  # page contents
    soup = soupfindAllnSave(pagefolder, url, soup, 'img', inner='src')
    soup = soupfindAllnSave(pagefolder, url, soup, 'link', inner='href')
    soup = soupfindAllnSave(pagefolder, url, soup, 'script', inner='src')
    with open(pagefilename+'.html', 'w') as file:
        file.write(soup.prettify())
    return soup
Example: saving the Google page and its contents (to the google_files folder):
session = requests.Session()
#... whatever requests config you need here
response = session.get('https://www.google.com')
savePage(response, 'google')
Try the Python library Scrapy. You can program Scrapy to recursively scan a website by downloading its pages, scanning them, and following links:
An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.
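For reference, a minimal sketch of what such a recursive spider could look like (the spider name, file name, and depth limit are illustrative assumptions, not part of the answer above); save it as, say, wiki_spider.py and run it with scrapy runspider wiki_spider.py:
import scrapy

class WikiSpider(scrapy.Spider):
    name = "wiki"
    start_urls = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
    custom_settings = {"DEPTH_LIMIT": 1}  # stop after one level of links

    def parse(self, response):
        # Save each downloaded page under a name derived from its URL path
        filename = response.url.rstrip("/").split("/")[-1] or "index"
        with open(filename + ".html", "wb") as f:
            f.write(response.body)
        # Follow every link on the page; Scrapy deduplicates requests for us
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)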

Web scraping/crawling a PDF document with a URL that changes on the website, with Python

import os
import requests
from bs4 import BeautifulSoup

desktop = os.path.expanduser("~/Desktop")

url = 'https://www.ici.org/research/stats'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

excel_files = soup.select('a[href*=xls]')
for each in excel_files:
    if 'Supplement: Worldwide Public Tables' in each.text:
        link = 'https://www.ici.org' + each['href']
        filename = each['href'].split('/')[-1]
        if os.path.isfile(desktop + '/' + filename):
            print('*** File already exists: %s ***' % filename)
            continue
        resp = requests.get(link)
        output = open(desktop + '/' + filename, 'wb')
        output.write(resp.content)
        output.close()
        print('Saved: %s' % filename)
I am new to web scraping and I want to automatically download a PDF document from a list of websites.
This document is updated on a monthly basis, and its URL on the website changes.
e.g. https://fundcentres.lgim.com/fund-centre/OEIC/Sterling-Liquidity-Fund
I want to download the 'factsheet' PDF document from the above website.
I think the ideal way would be for the code to 'press' the factsheet link and save the file to a location on the drive. The difficulty is that the URL changes!
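One possible approach, sketched under the assumption that the factsheet is exposed as a plain <a> link whose text or href contains the word "factsheet" (if the page builds its links with JavaScript, requests alone won't see them and a browser-automation tool such as Selenium would be needed instead): find that link on the landing page each month, resolve it against the page URL, and save the response content. Since the landing page URL stays the same, the changing PDF URL no longer matters.
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

page_url = "https://fundcentres.lgim.com/fund-centre/OEIC/Sterling-Liquidity-Fund"
soup = BeautifulSoup(requests.get(page_url).text, "html.parser")

for link in soup.find_all("a", href=True):
    # Assumption: the factsheet link is identifiable by its text or href
    if "factsheet" in (link.get_text() + link["href"]).lower():
        pdf_url = urljoin(page_url, link["href"])   # handle relative hrefs
        filename = os.path.basename(pdf_url.split("?")[0]) or "factsheet.pdf"
        with open(filename, "wb") as f:
            f.write(requests.get(pdf_url).content)
        print("Saved:", filename)
        break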

Is there a better, simpler way to download multiple files?

I went to the NYC MTA website to download some turnstile data and came up with a Python script to download only the 2017 data.
Here is the script:
import urllib
import re

html = urllib.urlopen('http://web.mta.info/developers/turnstile.html').read()
links = re.findall('href="(data/\S*17[01]\S*[a-z])"', html)

for link in links:
    txting = urllib.urlopen('http://web.mta.info/developers/'+link).read()
    lin = link[20:40]
    fhand = open(lin, 'w')
    fhand.write(txting)
    fhand.close()
Is there a simpler way to write this script?
As suggested by #dizzyf, you can use BeautifulSoup to get the href values from the web page.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html)
links = [link.get('href') for link in soup.find_all('a')
         if 'turnstile_17' in (link.get('href') or '')]  # guard against anchors with no href
If you don't have to get the files in Python (and you're on a system with the wget command), you can write the links to a file:
with open('url_list.txt', 'w') as url_file:
    for url in links:
        url_file.write(url + '\n')  # write() with an explicit newline (file objects have no writeline method)
Then download them with wget:
$ wget -i url_list.txt
wget -i downloads all the URLs from the file into the current directory, preserving the filenames.
The code below should do what you need.
import requests
import bs4
import time
import random
import re

pattern = '2017'
url_base = 'http://web.mta.info/developers/'
url_home = url_base + 'turnstile.html'
response = requests.get(url_home)
data = dict()

soup = bs4.BeautifulSoup(response.text)
links = [link.get('href') for link in soup.find_all('a',
                                                    text=re.compile('2017'))]

for link in links:
    url = url_base + link
    print "Pulling data from:", url
    response = requests.get(url)
    data[link] = response.text  # I don't know what you want to do with the data so here I just store it to a dict, but you could store it to a file as you did in your example.
    not_a_robot = random.randint(2, 15)
    print "Waiting %d seconds before next query." % not_a_robot
    time.sleep(not_a_robot)  # some APIs will throttle you if you hit them too quickly
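If you want the files on disk rather than in a dict, a small follow-on sketch in the same style (it reuses links, url_base, requests, time and random from the block above; taking the filename from the last segment of the link is an illustrative choice):
for link in links:
    url = url_base + link
    response = requests.get(url)
    filename = link.split('/')[-1]      # e.g. turnstile_170107.txt
    with open(filename, 'w') as fhand:
        fhand.write(response.text)
    print "Saved", filename
    time.sleep(random.randint(2, 15))   # stay polite between requests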

Download .xls files from a webpage using Python and BeautifulSoup

I want to download all the .xls or .xlsx or .csv from this website into a specified folder.
https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009
I have looked into mechanize, Beautiful Soup, urllib2, etc. Mechanize does not work in Python 3, and urllib2 also had problems with Python 3; I looked for workarounds but couldn't find any. So I am currently trying to make it work using Beautiful Soup.
I found some example code and attempted to modify it to suit my problem, as follows -
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin

url = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009/'
u = urlopen(url)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html)
for link in soup.select('div[webpartid] a'):
    href = link.get('href')
    if href.startswith('javascript:'):
        continue
    filename = href.rsplit('/', 1)[-1]
    href = urljoin(url, quote(href))
    try:
        urlretrieve(href, filename)
    except:
        print('failed to download')
However, when run, this code does not extract the files from the target page, nor does it output any failure message (e.g. 'failed to download').
How can I use BeautifulSoup to select the Excel files from the page?
How can I download these files to a local file using Python?
The issues with your script as it stands are:
The url has a trailing / which gives an invalid page when requested, not listing the files you want to download.
The CSS selector in soup.select(...) is selecting div elements with the attribute webpartid, which does not exist anywhere in that linked document.
You are joining the URL and quoting it, even though the links are given in the page as absolute URLs and do not need quoting.
The try:...except: block is stopping you from seeing the errors generated when trying to download the file. Using an except block without a specific exception is bad practice and should be avoided.
A modified version of your code that will get the correct files and attempt to download them is as follows:
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin

# Remove the trailing / you had, as that gives a 404 page
url = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'
u = urlopen(url)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")

# Select all A elements with href attributes containing URLs starting with http://
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')

    # Make sure it has one of the correct extensions
    if not any(href.endswith(x) for x in ['.csv', '.xls', '.xlsx']):
        continue

    filename = href.rsplit('/', 1)[-1]
    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)
    print("Done.")
However, if you run this you'll notice that a urllib.error.HTTPError: HTTP Error 403: Forbidden exception is thrown, even though the file is downloadable in the browser.
At first I thought this was a referrer check (to prevent hotlinking); however, if you watch the request in your browser (e.g. Chrome Developer Tools) you'll notice that
the initial http:// request is blocked there too, and then Chrome attempts an https:// request for the same file.
In other words, the request must go via HTTPS to work (despite what the URLs in the page say). To fix this you will need to rewrite http: to https: before using the URL for the request. The following code will correctly modify the URLs and download the files. I've also added a variable to specify the output folder, which is joined to the filename using os.path.join:
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve

URL = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'
OUTPUT_DIR = ''  # path to output folder, '.' or '' uses current folder

u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.csv', '.xls', '.xlsx']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])

    # We need a https:// URL for this site
    href = href.replace('http://', 'https://')

    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)
    print("Done.")
I found this to be a good working example, using the BeautifulSoup4, requests, and wget modules for Python 2.7:
import requests
import wget
import os
from bs4 import BeautifulSoup, SoupStrainer

url = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'
file_types = ['.xls', '.xlsx', '.csv']

for file_type in file_types:
    response = requests.get(url)
    for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            if file_type in link['href']:
                full_path = url + link['href']
                wget.download(full_path)
I tried the above code but it is still giving me urllib.error.HTTPError: HTTP Error 403: Forbidden.
I also tried adding a user agent; my modified code:
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import Request, urlopen, urlretrieve

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
URL = Request('https://www.rbi.org.in/scripts/bs_viewcontent.aspx?Id=2009', headers=headers)
#URL = 'https://www.rbi.org.in/scripts/bs_viewcontent.aspx?Id=2009'
OUTPUT_DIR = 'E:\python\out'  # path to output folder, '.' or '' uses current folder

u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.csv', '.xls', '.xlsx']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    # We need a https:// URL for this site
    href = href.replace('http://', 'https://')
    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)
    print("Done.")
This worked best for me, using Python 3:
import os
import urllib
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve
from urllib.error import HTTPError

URL = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'
OUTPUT_DIR = ''  # path to output folder, '.' or '' uses current folder

u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.csv', '.xls', '.xlsx']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    # We need a https:// URL for this site
    href = href.replace('http://', 'https://')
    try:
        print("Downloading %s to %s..." % (href, filename))
        urlretrieve(href, filename)
        print("Done.")
    except urllib.error.HTTPError as err:
        if err.code == 404:
            continue
