How to download all PDF files from multiple URLs in Python

Using Python, I'd like to download all PDF files (except those whose names begin with "INS") from this website:
url_asn = "https://www.asn.fr/recherche?filter_year[from]={}&filter_year[to]={}&limit=50&search_content_type=&search_text={}&sort_type=date&page={}"
If link['href'] is not a PDF, open it and download any PDF files it contains; for each results page, iterate through to the last page.

Will something like this work? I have added comments for every line.
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = " "  # url to scrape

# If there is no such folder, the script will create one automatically
folder_location = r'/webscraping'  # folder location
if not os.path.exists(folder_location):  # create folder if it doesn't exist
    os.mkdir(folder_location)

response = requests.get(url)  # get the html
soup = BeautifulSoup(response.text, "html.parser")  # parse the html

for link in soup.select("a[href$='.pdf']"):  # select all the pdf links
    # Name the pdf files using the last portion of each link which are unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])  # join the folder location and the filename
    # open the file and write the pdf
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
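
Note that this snippet only handles a single page and does not skip files whose names begin with "INS". Below is a minimal sketch of those two missing pieces, assuming the url_asn template above; the year range, search text, and last-page number are placeholders you would need to fill in (for example, by reading the site's pagination widget):

import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url_asn = ("https://www.asn.fr/recherche?filter_year[from]={}&filter_year[to]={}"
           "&limit=50&search_content_type=&search_text={}&sort_type=date&page={}")
folder_location = r'/webscraping'
os.makedirs(folder_location, exist_ok=True)

# Placeholder search parameters -- adjust to your own query.
year_from, year_to, search_text = 2020, 2022, ""
last_page = 5  # assumption: replace with the real last-page number from the site

for page in range(1, last_page + 1):
    page_url = url_asn.format(year_from, year_to, search_text, page)
    soup = BeautifulSoup(requests.get(page_url).text, "html.parser")
    for link in soup.select("a[href$='.pdf']"):
        name = link['href'].split('/')[-1]
        if name.startswith("INS"):  # skip files whose names begin with "INS"
            continue
        with open(os.path.join(folder_location, name), 'wb') as f:
            f.write(requests.get(urljoin(page_url, link['href'])).content)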

Related

How to select and download only specific PDF from a website?

I found some code online that downloads all the PDFs found at a URL, and it works, but it fails on the website I need it for. I'm trying to download the PDF of the menu for each day of the week, and I can't figure out how to narrow it down to only those 7 PDF files.
from bs4 import BeautifulSoup
import requests

url = "https://calbaptist.edu/dining/alumni-dining-commons"

# Requests URL and get response object
response = requests.get(url)

# Parse text obtained
soup = BeautifulSoup(response.text, 'html.parser')

# Find all hyperlinks present on webpage
links = soup.find_all('a')

i = 0

# From all links check for pdf link and
# if present download file
for link in links:
    if ".pdf" in link.get('href', []):
        i += 1
        print("Downloading file: ", i)

        # Get response object for link
        response = requests.get(link.get('href'))

        # Write content in pdf file
        pdf = open("pdf" + str(i) + ".pdf", 'wb')
        pdf.write(response.content)
        pdf.close()
        print("File ", i, " downloaded")

print("All PDF files downloaded")
I tried to change the if-statement so that instead of looking for .pdf it looks for /dining/menus-and-hours/adc-menus/. This gave me an error on the line that gets the response object for the link.
Check the href values: they are relative, not absolute, so you have to prepend the base URL.
You could also select your elements more specifically with a CSS selector, e.g. one that matches hrefs containing a substring:
soup.select('a[href*="/dining/menus-and-hours/adc-menus/"]')
or ending with .pdf:
soup.select('a[href$=".pdf"]')
You may also want to take a look at enumerate():
for i, e in enumerate(soup.select('a[href*="/dining/menus-and-hours/adc-menus/"]'), start=1):
And check the content type of the response header:
requests.get('https://calbaptist.edu' + e.get('href')).headers['Content-Type']
Example
from bs4 import BeautifulSoup
import requests

url = "https://calbaptist.edu/dining/alumni-dining-commons"
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

for i, e in enumerate(soup.select('a[href*="/dining/menus-and-hours/adc-menus/"]'), start=1):
    r = requests.get('https://calbaptist.edu' + e.get('href'))
    if r.headers['Content-Type'] == 'application/pdf':
        pdf = open("pdf" + str(i) + ".pdf", 'wb')
        pdf.write(r.content)
        pdf.close()
        print("File ", i, " downloaded")

Trying to web-scrape multiple PDF files with Beautiful Soup: it downloads only a few PDFs for no apparent reason

I'm trying to scrape multiple PDF files from the website of a local council. I'm using code I saw on YouTube, which I have tested on other pages where it works well. However, when I try it on the webpage I'm interested in, it downloads only a few PDFs and then the program stops. The webpage uses a year filter, which starts at 2022, yet when I run the program only PDFs from 2007 are downloaded, for no apparent reason. I cannot grasp what's happening.
Here is the code:
import os
import datetime
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def extract_url_pdf(input_url, folder_path=os.getcwd()):
    url = input_url

    # If there is no such folder, the script will create one automatically
    folder_location = 'D:/Datos/Ordenanzas municipales/Municipalidad'
    if not os.path.exists(folder_location):
        os.mkdir(folder_location)

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    link_text = list()
    link_href = list()
    link_file = list()
    counter = 0

    for link in soup.select("a[href$='.pdf']"):
        filename = os.path.join(folder_location, link['href'].split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(url, link['href'])).content)
        link_text.append(str(link.text))
        link_href.append(link['href'])
        link_file.append(link['href'].split('/')[-1])
        counter += 1
        print(counter, "-Files Extracted from URL named ", link['href'].split('/')[-1])

extract_url_pdf(input_url="https://munihuamanga.gob.pe/normas-legales/ordenanzas-municipales/")
If your goal is to download the PDFs, it's easiest to download them directly from the document repository on the site. Just specify the year in the function:
import requests
from bs4 import BeautifulSoup

def download_pdf(year):
    url = f'https://munihuamanga.gob.pe/Documentos_mph/Munitransparencia/Normas_legales/Ordenanzas_municipales/{year}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    for filename in [href.get('href') for href in soup.find_all('a') if '.pdf' in href.get('href')]:
        with open(filename, 'wb') as f:
            f.write(requests.get(url + filename).content)
        print(f'{filename} was downloaded')

download_pdf(2022)
OUTPUT:
o_m_01_emitido_17022022.pdf was downloaded
o_m_02_emitido_08032022.pdf was downloaded
o_m_03_emitido_04042022.pdf was downloaded
o_m_04_emitido_04042022.pdf was downloaded
o_m_04_emitido_04042022p.pdf was downloaded
o_m_05_emitido_08042022.pdf was downloaded
o_m_06_emitido_13042022.pdf was downloaded
o_m_07_emitido_13052022.pdf was downloaded
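
Since the repository keeps one directory per year, the same function can simply be called for every year you need. A short sketch; the exact range of years available on the server is an assumption to verify:

# Assumption: a directory exists for each year under the same base path.
for year in range(2007, 2023):
    download_pdf(year)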

Issue downloading multiple PDFs

After running the following code, I am unable to open the downloaded PDFs. Even though the code ran successfully, the downloaded PDF files are damaged.
My computer's error message is
Unable to open file. it may be damaged or in a format Preview doesn't recognize.
Why are they damaged and how do I solve this?
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://github.com/sonhuytran/MIT8.01SC.2010F/tree/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual"

# If there is no such folder, the script will create one automatically
folder_location = r'/Users/rahelmizrahi/Desktop/ Physics_Solutions'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

for link in soup.select("a[href$='.pdf']"):
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
The issue is that you are requesting the GitHub 'blob' link when you need the 'raw' link:
'/sonhuytran/MIT8.01SC.2010F/blob/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf'
but you want:
'/sonhuytran/MIT8.01SC.2010F/raw/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf'
So just adjust that. Full code below:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://github.com/sonhuytran/MIT8.01SC.2010F/tree/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual"

# If there is no such folder, the script will create one automatically
folder_location = r'/Users/rahelmizrahi/Desktop/Physics_Solutions'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

for link in soup.select("a[href$='.pdf']"):
    pdf_link = link['href'].replace('blob', 'raw')
    pdf_file = requests.get('https://github.com' + pdf_link)
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(pdf_file.content)
I had to use soup.select("a[href$=.pdf]") (without the inner quotes) to get it to select the links correctly.
After that, your script works, but: what you're downloading is not a PDF, but an HTML webpage! Try visiting one of the URLs: https://github.com/sonhuytran/MIT8.01SC.2010F/blob/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf
You'll be presented with a GitHub webpage, not the actual PDF. To get that, you need the "raw" GitHub URL, which you can see when you hover over the Download button: https://github.com/sonhuytran/MIT8.01SC.2010F/raw/master/References/University%20Physics%20with%20Modern%20Physics%2C%2013th%20Edition%20Solutions%20Manual/A01_YOUN6656_09_ISM_FM.pdf
So, it looks like you just have to replace blob with raw at the proper spot to make it work:
href = link['href']
href = href.replace('/blob/', '/raw/')
requests.get(urljoin(url, href)).content
The issue is that the file is not properly closed after the open/write.
Just add f.close() at the end of the code to do that.

Automate download all links (of PDFs) inside multiple pdf files

I'm trying to download journal issues from a website (http://cis-ca.org/islamscience1.php). I ran something to get all the PDFs on this page. However, these PDFs contain links that point to other PDFs.
I want to get the terminal articles from all the PDF links.
This got all the PDFs from the page http://cis-ca.org/islamscience1.php:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://cis-ca.org/islamscience1.php"

# If there is no such folder, the script will create one automatically
folder_location = r'webscraping'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

for link in soup.select("a[href$='.pdf']"):
    # Name the pdf files using the last portion of each link which are unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
I'd like to get the articles linked inside these PDFs.
Thanks in advance
Take a look at this link: https://mamclain.com/?page=Blog_Programing_Python_Removing_PDF_Hyperlinks_With_Python
It shows how to identify hyperlinks and sanitize a PDF document. You could follow it up to the identification part and then store the hyperlinks instead of sanitizing them.
Alternatively, take a look at this library: https://github.com/metachris/pdfx
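
To illustrate the pdfx route, here is a minimal sketch of the second stage: open each PDF already saved by the script above, list the references found inside it, and let pdfx fetch the linked PDFs. The folder names are assumptions, and the exact keys returned by get_references_as_dict() should be checked against your installed pdfx version:

import os
import pdfx

folder_location = r'webscraping'           # folder with the PDFs downloaded above
target_location = r'webscraping/articles'  # hypothetical output folder for the linked articles
os.makedirs(target_location, exist_ok=True)

for name in os.listdir(folder_location):
    if not name.lower().endswith('.pdf'):
        continue
    pdf = pdfx.PDFx(os.path.join(folder_location, name))
    # References found inside the PDF, grouped by type (e.g. 'pdf', 'url') per the pdfx README
    refs = pdf.get_references_as_dict()
    print(name, '->', refs.get('pdf', []))
    # pdfx can also download every referenced PDF directly
    pdf.download_pdfs(target_location)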

How to loop through .txt file links on a website, scrape them, and store the data in one malleable CSV/Excel file

I want to be able to scrape the data from a particular website (https://physionet.org/challenge/2012/set-a/) and from subdirectories like it, taking each text file and adding it to one giant CSV or Excel file so that I can see all the data in one place.
I have deployed the following code, similar to this article, but it basically just downloads all the text files on the page and stores them in my working directory. And it honestly just takes too long to run.
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

url = 'https://physionet.org/challenge/2012/set-a/'
response = requests.get(url)
response  # 200 indicates that it works...

soup = BeautifulSoup(response.text, "html.parser")

for i in range(5, len(soup.findAll('a')) + 1):  # 'a' tags are for links
    one_a_tag = soup.findAll('a')[i]
    link = one_a_tag['href']
    download_url = 'https://physionet.org/challenge/2012/set-a/' + link
    urllib.request.urlretrieve(download_url, './' + link[link.find('/132539.txt') + 1:])
    time.sleep(1)  # pause the code for a sec
The actual result is just a bunch of text files crowding my working directory; instead of that, I'd like to end up with all of the data in one large CSV file.
If you want to save them but have to do it a bit at a time (maybe you don't have enough RAM to hold everything at once), then I would just append the files to a master file one by one.
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

output_file = 'outputfile.txt'
url = 'https://physionet.org/challenge/2012/set-a/'

# Download and find all the links. Check the last 4 characters to verify it's one
# of the files we are looking for
response = requests.get(url, verify=False)
soup = BeautifulSoup(response.text, "html.parser")
links = [a['href'] for a in soup.find_all('a') if a['href'][-4:] == '.txt']

# Clear the current file
with open(output_file, 'w'):
    pass

# Iterate through all the links
for href in links:
    response = requests.get("{}{}".format(url, href), verify=False)
    if response:
        # Open up the output_file in append mode so we can just write to the one file
        with open(output_file, 'a') as f:
            f.write(response.text)
        print(len(response.text.split('\n')))
The one downside to this is that you would have the headers from each text file, but you can change the f.write() to the following to get it without any headers:
f.write("\n".join(response.text.split('\n')[1:]))
If you do have the RAM available, you could read in all the files with a list comprehension, then use pandas.concat() to put them in one giant DataFrame and df.to_csv() to export it to a file:
import pandas as pd

df = pd.concat([pd.read_csv("{}{}".format(url, href)) for href in links])
df.to_csv(output_file)
