How can I download multiple PDF files with Python?

I am trying to download the publications on every page of https://occ.ca/our-publications
My end goal is to parse through the text in the PDF files and locate certain keywords.
Thus far, I have been able to scrape the links to the PDF files on all the pages and save them into a list. Now, I want to go through the list and download all the PDF files with Python. Once the files have been downloaded, I want to parse through them.
This is the code that I have used thus far:
import requests
from bs4 import BeautifulSoup
import lxml
import csv
# This code adds all PDF links into a list called "publications".
publications = []
for i in range(19):
    response = requests.get('https://occ.ca/our-publications/page/{}/'.format(i),
                            headers={'User-Agent': 'Mozilla'})
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'lxml')
        pdfs = soup.findAll('div', {"class": "publicationoverlay"})
        links = [pdf.find('a').attrs['href'] for pdf in pdfs]
        publications.append(links)
Next, I want to go through that list and download the PDF files.
import urllib.request
for x in publications:
    urllib.request.urlretrieve(x, 'Publication_{}'.format(range(213)))
This is the error I get when I run the code:
Traceback (most recent call last):
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\m.py", line 23, in <module>
    urllib.request.urlretrieve(x,'Publication_ {}.pdf'.format(range(213)))
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 247, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 531, in open
    response = meth(req, response)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 569, in error
    return self._call_chain(*args)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
    result = func(*args)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

Please try:
import requests
from bs4 import BeautifulSoup
import lxml
import csv

# This code adds all PDF links into a list called "publications".
publications = []
for i in range(19):
    response = requests.get('https://occ.ca/our-publications/page/{}/'.format(i),
                            headers={'User-Agent': 'Mozilla'})
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'lxml')
        pdfs = soup.findAll('div', {"class": "publicationoverlay"})
        links = [pdf.find('a').attrs['href'] for pdf in pdfs]
        publications.extend(links)

for cntr, link in enumerate(publications):
    print("try to get link", link)
    rslt = requests.get(link)
    print("Got", rslt)
    fname = "temporarypdf_%d.pdf" % cntr
    with open(fname, "wb") as fout:
        # rslt.content holds the response bytes; rslt.raw would need stream=True.
        fout.write(rslt.content)
    print("saved pdf data into", fname)
    # Call here the code that reads and parses the pdf.
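For the pdf-parsing step itself, here is a minimal sketch using the pypdf library (the choice of pypdf and the example keywords are my assumptions; any PDF text extractor would do). It reuses the temporarypdf_N.pdf filenames written by the loop above:
from pypdf import PdfReader

keywords = ["housing", "policy"]  # hypothetical keywords to look for
for cntr in range(len(publications)):
    fname = "temporarypdf_%d.pdf" % cntr
    reader = PdfReader(fname)
    # Join the extracted text of every page; extract_text() may return None.
    text = " ".join((page.extract_text() or "") for page in reader.pages)
    for kw in keywords:
        if kw.lower() in text.lower():
            print("found", kw, "in", fname)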

Could you please also tell us the line number where the error occurs?

Related

Error in web-Scraping Code Using BeautifulSoup

I want to get data from https://www.cvedetails.com/vulnerability-list/vendor_id-26/product_id-32238/Microsoft-Windows-10.html from page 1 to the last page, sorted by "CVE Number Ascending". The data I want to retrieve, in CSV format, is everything in the table header and the table data.
I have been trying out a few pieces of code, but none of them seem to work, and I'm kind of desperate now. https://youtu.be/XQgXKtPSzUI is the place I am trying to learn from. Any help would be appreciated.
I asked this once before and the replies I got were great, but they didn't seem to get what I need, and I am confused about how this works, even more so because of how weird the source code for the website is.
#!/usr/bin/env python3
import bs4 # Good HTML parser
from urllib.request import urlopen as uReq # Helps with opening URL
from bs4 import BeautifulSoup as soup
# The target URL
my_url = 'https://www.cvedetails.com/vulnerability-list.php?vendor_id=26&product_id=32238&version_id=&page=1&hasexp=0&opdos=0&opec=0&opov=0&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0&cvssscoremax=0&year=0&month=0&cweid=0&order=2&trc=851&sha=41e451b72c2e412c0a1cb8cb1dcfee3d16d51c44'
# Check process
# print(my_url)
# Open a connection, grab the webpage, and download it
uClient = uReq(my_url)
# Save the webpage into a variable
page_html = uClient.read()
# Close the internet connection from uclient
uClient.close()
# Call soup to parse the HTML with the html parser and save it to a variable
page_soup = soup(page_html,"html.parser")
print(page_soup.h1)
This is the error:
Traceback (most recent call last):
  File "./Testing3.py", line 21, in <module>
    uClient = uReq(my_url)
  File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/usr/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/usr/lib/python3.6/urllib/request.py", line 570, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.6/urllib/request.py", line 504, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.6/urllib/request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
To avoid this error, you need to supply the user agent through the header in the request.
Try modifying your script as follows:
#!/usr/bin/env python3
import bs4
from urllib.request import urlopen as uReq, Request
from bs4 import BeautifulSoup as soup
#bs4 is a good html parser
#urllib.request helps with opening the url
#setting the target url
my_url = 'https://www.cvedetails.com/vulnerability-list.php?vendor_id=26&product_id=32238&version_id=&page=1&hasexp=0&opdos=0&opec=0&opov=0&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0&cvssscoremax=0&year=0&month=0&cweid=0&order=2&trc=851&sha=41e451b72c2e412c0a1cb8cb1dcfee3d16d51c44'
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(my_url,headers=hdr)
page = uReq(req)
page_soup = soup(page, "html.parser")
print(page_soup.h1)
Instead of urllib, why don't you use the requests module directly? Try this code:
import requests
from bs4 import BeautifulSoup as soup
my_url = 'https://www.cvedetails.com/vulnerability-list.php?vendor_id=26&product_id=32238&version_id=&page=1&hasexp=0&opdos=0&opec=0&opov=0&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0&cvssscoremax=0&year=0&month=0&cweid=0&order=2&trc=851&sha=41e451b72c2e412c0a1cb8cb1dcfee3d16d51c44'
page_html = requests.get(my_url).text
page_soup = soup(page_html,"html.parser")
print(page_soup.h1)
output:
<h1>
Microsoft » Windows 10 : Security Vulnerabilities
</h1>
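To actually cover page 1 through the last page and write the rows out as CSV, something along these lines might work. This is only a sketch: the shortened query string, the page count, and the assumption that the results live in the first HTML table on the page are all mine:
import csv
import requests
from bs4 import BeautifulSoup

# Shortened form of the question's query string, with the page number templated.
base = ('https://www.cvedetails.com/vulnerability-list.php'
        '?vendor_id=26&product_id=32238&order=2&page={}')

with open('cve.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for page_no in range(1, 18):  # hypothetical last page; adjust as needed
        html = requests.get(base.format(page_no),
                            headers={'User-Agent': 'Mozilla/5.0'}).text
        table = BeautifulSoup(html, 'html.parser').find('table')
        if table is None:  # no results table means we ran past the last page
            break
        for row in table.find_all('tr'):
            cells = [c.get_text(strip=True) for c in row.find_all(['th', 'td'])]
            if cells:
                writer.writerow(cells)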

passing a variable to urlopen() and reading it again in python using bs4

I am planning to open a bunch of links where the only thing changing is the year at the end of the link. I am using the code below, but it returns a bunch of errors. My aim is to open each link and filter some things on the page, but first I need to be able to open all the pages, hence this test code:
from xlwt import *
from urllib.request import urlopen
from bs4 import BeautifulSoup, SoupStrainer
from xlwt.Style import *
j = 2014
for j in range(2015):
    conv = str(j)
    content = urlopen("http://en.wikipedia.org/wiki/List_of_Telugu_films_of_%s").read() % conv
    j += 1
    print(content)
Errors:
Traceback (most recent call last):
  File "F:\urltest.py", line 11, in <module>
    content = urlopen("http://en.wikipedia.org/wiki/List_of_Telugu_films_of_%s").read() %conv
  File "C:\Python34\lib\urllib\request.py", line 161, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Python34\lib\urllib\request.py", line 469, in open
    response = meth(req, response)
  File "C:\Python34\lib\urllib\request.py", line 579, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Python34\lib\urllib\request.py", line 507, in error
    return self._call_chain(*args)
  File "C:\Python34\lib\urllib\request.py", line 441, in _call_chain
    result = func(*args)
  File "C:\Python34\lib\urllib\request.py", line 587, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: Bad Request
A little guidance is required. If there is any other way to pass the variables [2014, 2015, etc.], that would also be great.
That may be because you are declaring j and then modifying it at the end of your loop. range() already does this for you, so you don't have to increment it. Also, your string interpolation syntax looks wrong: the % operator and the variable must come immediately after the string, as in print("Hi %s!" % name).
Try:
for j in range(2015):
    conv = str(j)
    content = urlopen("http://en.wikipedia.org/wiki/List_of_Telugu_films_of_%s" % conv).read()
Also, I am assuming you don't want to query from years 0 to 2015. You can call range(start_year, end_year) to iterate from [start_year, end_year).
As cesar pointed out in his answer, incrementing j is not needed since you are already looping over it. Likewise, assigning j before the loop has no effect, because the for loop reassigns j from range() on its first iteration anyway.
This will create a dictionary called contents where each key is referring to the page of the corresponding year:
import urllib2
url = "http://en.wikipedia.org/wiki/List_of_Telugu_films_of_%d"
contents = {year: urllib2.urlopen(url % year).read()
            for year in range(2014, 2015 + 1)}
However, if you have multiple pages to load, I think the best way would be to save each file to your local disk first and then load from there for further processing.
This would be because you probably want to go back to your parsing process multiple times but want to download the files only once. So consider doing something like:
# reading (only once)
for year in range(start_year, end_year + 1):
    with open('year_%d.txt' % year, 'w') as f:
        f.write(urllib2.urlopen(url % year).read())

# processing
for year in range(start_year, end_year + 1):
    with open('year_%d.txt' % year, 'r') as f:  # the filename needs "% year" here too
        page = f.read()
        process(page)
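The snippets above are Python 2 (urllib2). In Python 3 the same save-then-process pattern would look roughly like this; only the module name and the byte handling change:
from urllib.request import urlopen

url = "http://en.wikipedia.org/wiki/List_of_Telugu_films_of_%d"
for year in range(2014, 2015 + 1):
    # urlopen().read() returns bytes in Python 3, so open the file in binary mode.
    with open('year_%d.txt' % year, 'wb') as f:
        f.write(urlopen(url % year).read())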

Script to download stock price data from yahoo finance, randomly 404s

The following script reads a .txt file of company ticker symbols and downloads the corresponding financial information into .csv files. The data is pulled from Yahoo Finance and saved in a local directory.
import urllib.request
import requests
import time
# Define the URL to download the .csv from.
url_begin = "http://real-chart.finance.yahoo.com/table.csv?s="
url_end = "&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv"

# Function that reads all available ticker symbols from ticker_daten.txt.
# This file should be in the same directory as the program.
def readTickers(file):
    read_ticker = []
    ins = open(file, "r")
    for line in ins:
        if line.endswith('\n'):
            line = line[:-1]
        read_ticker.append(line)
    ins.close()
    return read_ticker

# File location for ticker symbols to download.
tickers = readTickers("C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/ticker_daten.txt")

# Loop through the list of ticker symbols and download the .csv's.
for i in tickers:
    # Forge the downloadable link.
    link_created = url_begin + i + url_end
    # Make sure that the link actually leads to a file.
    try:
        r = requests.head(link_created)
        if r.status_code == 404:
            print(str(r.status_code) + ": No page found!")
            time.sleep(0.5)
        else:
            print(link_created)
            # Finally download the file, if it does exist.
            urllib.request.urlretrieve(link_created, "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/" + i + ".csv")
            time.sleep(0.5)
    except requests.ConnectionError:
        # A connection error occurred.
        print("ConnectionError: 404 No page found!")
    except requests.HTTPError:
        # An HTTP error occurred.
        print("HTTPError!")
    except requests.Timeout:
        # Connection timed out.
        print("Timeout!")
The problem: the script crashes randomly after loading between 20 and 1750 .csv's. The crash produces the following output.
Process started >>>
http://real-chart.finance.yahoo.com/table.csv?s=0055.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0056.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0057.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0058.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
Traceback (most recent call last):
  File "Stock-Price Leecher.py", line 40, in <module>
    urllib.request.urlretrieve(link_created, "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/"+i+".csv")
  File "c:\Python34\lib\urllib\request.py", line 178, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "c:\Python34\lib\urllib\request.py", line 153, in urlopen
    return opener.open(url, data, timeout)
  File "c:\Python34\lib\urllib\request.py", line 461, in open
    response = meth(req, response)
  File "c:\Python34\lib\urllib\request.py", line 571, in http_response
    'http', request, response, code, msg, hdrs)
  File "c:\Python34\lib\urllib\request.py", line 499, in error
    return self._call_chain(*args)
  File "c:\Python34\lib\urllib\request.py", line 433, in _call_chain
    result = func(*args)
  File "c:\Python34\lib\urllib\request.py", line 579, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
<<< Process finished. (Exit code 1)
================ READY ================
Does anyone have any idea why this might happen?
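One way to keep the loop alive through these intermittent 404s is to catch the HTTPError around the urlretrieve() call itself, since the earlier HEAD check and the later GET can evidently disagree. A minimal sketch of that change, reusing url_begin, url_end, and tickers from the script above:
import time
import urllib.error
import urllib.request

for i in tickers:
    link_created = url_begin + i + url_end
    try:
        # Download directly; a ticker that 404s now just logs instead of crashing.
        urllib.request.urlretrieve(link_created,
            "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/" + i + ".csv")
    except urllib.error.HTTPError as e:
        print(i, "failed:", e.code)
    time.sleep(0.5)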

Combined Exceptions With Beautiful Soup HTTPError Not Defined

I'm trying to write code that scrapes stock symbol data into a csv file. However, I get the following error.
Traceback (most recent call last):
  File "company_data_v3.py", line 23, in <module>
    page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s="+newsymbolslist[i] +"%20Key%20Statistics").read()
  File "C:\Python27\lib\urllib2.py", line 127, in urlopen
    return _opener.open(url, data, timeout)
  File "C:\Python27\lib\urllib2.py", line 410, in open
    response = meth(req, response)
  File "C:\Python27\lib\urllib2.py", line 523, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Python27\lib\urllib2.py", line 448, in error
    return self._call_chain(*args)
  File "C:\Python27\lib\urllib2.py", line 382, in _call_chain
    result = func(*args)
  File "C:\Python27\lib\urllib2.py", line 531, in http_error_default
    raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 400: Bad Request
I have tried this suggestion, which imports urllib2's HTTPError into the program, but it has not worked. (It seems redundant to do that, since I already have the module imported.)
The symbols.txt file has stock symbols. Here is the code that I am using:
import urllib2
from BeautifulSoup import BeautifulSoup
import csv
import re
import urllib
from urllib2 import HTTPError
# import modules

symbolfile = open("symbols.txt")
symbolslist = symbolfile.read()
newsymbolslist = symbolslist.split("\n")

i = 0
f = csv.writer(open("pe_ratio.csv", "wb"))
# shortcut to write
f.writerow(["Name", "PE", "Revenue % Quarterly", "ROA% YOY", "Operating Cashflow", "Debt to Equity"])
# first writerow statement

# define name_company as the following
while i < len(newsymbolslist):
    page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s=" + newsymbolslist[i] + "%20Key%20Statistics").read()
    soup = BeautifulSoup(page)
    name_company = soup.findAll("div", {"class": "title"})
    for name in name_company:  # add multiple iterations?
        all_data = soup.findAll('td', "yfnc_tabledata1")
        stock_name = name.find('h2').string  # find the company's name in name_company with the h2 tag
        try:
            f.writerow([stock_name, all_data[2].getText(), all_data[17].getText(),
                        all_data[13].getText(), all_data[29].getText(),
                        all_data[26].getText()])  # write down the PE data
        except (IndexError, urllib2.HTTPError) as e:
            pass
    i += 1
Do I need to define the error more specifically? Thanks for your help.
You are catching the exception in the wrong location. The urlopen() call throws the exception, as shown by the first lines of your traceback:
Traceback (most recent call last):
  File "company_data_v3.py", line 23, in <module>
    page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s="+newsymbolslist[i] +"%20Key%20Statistics").read()
Catch it there:
while i < len(newsymbolslist):
    try:
        page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s=" + newsymbolslist[i] + "%20Key%20Statistics").read()
    except urllib2.HTTPError:
        i += 1  # advance past the failing symbol, or the loop retries it forever
        continue
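The i += 1 in the except branch matters: without it the while loop would retry the same failing URL forever. A plain for loop sidesteps the index bookkeeping entirely, for example:
for symbol in newsymbolslist:
    try:
        page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s=" + symbol + "%20Key%20Statistics").read()
    except urllib2.HTTPError:
        continue  # skip symbols that Yahoo rejects
    # ... parse page as before ...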

Python follow redirects and then download the page?

I have the following python script and it works beautifully.
import urllib2
url = 'http://abc.com' # write the url here
usock = urllib2.urlopen(url)
data = usock.read()
usock.close()
print data
However, some of the URLs I give it may redirect 2 or more times. How can I have Python follow the redirects to completion before reading the data?
For instance, when using the above code with http://www.google.com/search?hl=en&q=KEYWORD&btnI=1 (the equivalent of hitting the "I'm Feeling Lucky" button on a Google search), I get:
>>> url = 'http://www.google.com/search?hl=en&q=KEYWORD&btnI=1'
>>> usick = urllib2.urlopen(url)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
    return _opener.open(url, data, timeout)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 400, in open
    response = meth(req, response)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 513, in http_response
    'http', request, response, code, msg, hdrs)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 438, in error
    return self._call_chain(*args)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 372, in _call_chain
    result = func(*args)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 521, in http_error_default
    raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
>>>
I've tried the (url, data, timeout) arguments; however, I am unsure what to put there.
EDIT: I actually found out that if I don't follow the redirect and just use the headers of the first response, I can grab the location of the next redirect and use that as my final link.
Use requests as the other answer states; here is an example. The redirect will be in r.url. In the example below, the http URL is redirected to https.
For HEAD:
In [1]: import requests
...: r = requests.head('http://github.com', allow_redirects=True)
...: r.url
Out[1]: 'https://github.com/'
For GET:
In [1]: import requests
...: r = requests.get('http://github.com')
...: r.url
Out[1]: 'https://github.com/'
Note that for HEAD you have to specify allow_redirects; if you don't, you can get the redirect target from the headers, but this is not advised.
In [1]: import requests
In [2]: r = requests.head('http://github.com')
In [3]: r.headers.get('location')
Out[3]: 'https://github.com/'
To download the page you will need GET; you can then access the page body using r.content.
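requests also records the intermediate hops in r.history, which is useful when you want the whole redirect chain rather than just the final URL (the single 301 shown below assumes GitHub's usual http-to-https hop):
In [1]: import requests
   ...: r = requests.get('http://github.com')
   ...: ([resp.status_code for resp in r.history], r.url)
Out[1]: ([301], 'https://github.com/')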
You might be better off with the Requests library, which has better APIs for controlling redirect handling:
https://requests.readthedocs.io/en/master/user/quickstart/#redirection-and-history
Requests:
https://pypi.org/project/requests/ (urllib replacement for humans)
