I want to download a PDF and store it in a folder on my local computer.
Here is the link to the PDF I want to download: https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738
I have written code with both Python Selenium and urllib, but both failed to download the file.
import time, urllib
time.sleep(2)
pdfPath = "https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738"
pdfName = "jco.2018.77.8738.pdf"
f = open(pdfName, 'wb')
# urllib.urlopen exists only in Python 2; on Python 3 this line raises AttributeError
# (the Python 3 equivalent is urllib.request.urlopen)
f.write(urllib.urlopen(pdfPath).read())
f.close()
It's much easier with the requests library:
import requests
url = 'https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738'
pdfName = "./jco.2018.77.8738.pdf"
r = requests.get(url)
with open(pdfName, 'wb') as f:
    f.write(r.content)
Or, using pathlib to write the response bytes:
from pathlib import Path
import requests
filename = Path("jco.2018.77.8738.pdf")
url = "https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738"
response = requests.get(url)
filename.write_bytes(response.content)
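If neither snippet produces a valid PDF, check the HTTP status code before writing the file; some publisher sites only serve the PDF to requests that look like a browser. A minimal sketch, assuming the server merely checks request headers (the User-Agent value below is only an example):
import requests
url = "https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738"
headers = {"User-Agent": "Mozilla/5.0"}  # browser-like header, purely illustrative
r = requests.get(url, headers=headers)
r.raise_for_status()  # fail loudly instead of silently writing an error page to disk
with open("jco.2018.77.8738.pdf", "wb") as f:
    f.write(r.content)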
I need help with some code. I want to download the PDFs from all pages of "/#documentu", not only one, and I don't want to write all those links into the code. The parsing must be automatic. Here's the code:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Need to download the PDFs from every page that has files, including the "/#documentu" page,
# without writing all the links into the code: the parsing must be automatic.
urlpage = "https://fasie.ru/programs/programma-innoshkolnik/#documentu"
# If the folder does not exist, the script will create it automatically
folder_location = r'C:\Download'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)
response = requests.get(urlpage)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    # Name each PDF with the last portion of its link, which is unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(urlpage, link['href'])).content)
# Rename the downloaded files to sequential numbers
path = r'C:\Download'
i = 1
for file_name in os.listdir(path):
    base_name, ext = os.path.splitext(file_name)
    abs_file_name = os.path.join(path, file_name)
    new_abs_file_name = os.path.join(path, str(i) + ext)
    os.rename(abs_file_name, new_abs_file_name)
    i += 1
I need help making the parsing automatic.
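One way to make the parsing automatic is sketched below. It assumes the individual programme pages (including the one ending in /#documentu) are all linked from a programmes listing page, so the start URL and the CSS selectors here are assumptions rather than anything verified against the site:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

start_url = "https://fasie.ru/programs/"  # assumed listing page
folder_location = r'C:\Download'
os.makedirs(folder_location, exist_ok=True)

session = requests.Session()
listing = BeautifulSoup(session.get(start_url).text, "html.parser")

# Collect candidate programme pages from the listing (the selector is an assumption)
pages = {urljoin(start_url, a['href']) for a in listing.select("a[href*='/programs/']")}

for page_url in pages:
    soup = BeautifulSoup(session.get(page_url).text, "html.parser")
    for link in soup.select("a[href$='.pdf']"):
        pdf_url = urljoin(page_url, link['href'])
        filename = os.path.join(folder_location, pdf_url.split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(session.get(pdf_url).content)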
I'm trying to scrape multiple PDF files from the website of a local council. I'm using code I saw on YouTube, which I have tested on other pages where it works well. However, when I try it on the page I'm interested in, it downloads only a few PDFs and then the program stops. This page uses a year filter that starts at 2022, yet when I run the program only PDFs from 2007 are downloaded, for no apparent reason. I cannot grasp what is happening.
Here is the code:
import os

def extract_url_pdf(input_url, folder_path=os.getcwd()):
    import requests
    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    url = input_url
    # If the folder does not exist, the script will create it automatically
    folder_location = 'D:/Datos/Ordenanzas municipales/Municipalidad'
    if not os.path.exists(folder_location):
        os.mkdir(folder_location)

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    link_text = list()
    link_href = list()
    link_file = list()
    counter = 0
    for link in soup.select("a[href$='.pdf']"):
        filename = os.path.join(folder_location, link['href'].split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(url, link['href'])).content)
        link_text.append(str(link.text))
        link_href.append(link['href'])
        link_file.append(link['href'].split('/')[-1])
        counter += 1
        print(counter, "- Files extracted from URL, named", link['href'].split('/')[-1])

extract_url_pdf(input_url="https://munihuamanga.gob.pe/normas-legales/ordenanzas-municipales/")
If your goal is just to download the PDFs, the easiest approach is to download them straight from the document repository on the site. Just specify the year in the function:
import requests
from bs4 import BeautifulSoup

def download_pdf(year):
    url = f'https://munihuamanga.gob.pe/Documentos_mph/Munitransparencia/Normas_legales/Ordenanzas_municipales/{year}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    for filename in [href.get('href') for href in soup.find_all('a') if '.pdf' in href.get('href', '')]:
        with open(filename, 'wb') as f:
            f.write(requests.get(url + filename).content)
        print(f'{filename} was downloaded')

download_pdf(2022)
OUTPUT:
o_m_01_emitido_17022022.pdf was downloaded
o_m_02_emitido_08032022.pdf was downloaded
o_m_03_emitido_04042022.pdf was downloaded
o_m_04_emitido_04042022.pdf was downloaded
o_m_04_emitido_04042022p.pdf was downloaded
o_m_05_emitido_08042022.pdf was downloaded
o_m_06_emitido_13042022.pdf was downloaded
o_m_07_emitido_13052022.pdf was downloaded
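To pull several years in one run, the same function can simply be called in a loop (the year range below is an assumption; adjust it to whatever the repository actually holds):
for year in range(2007, 2023):
    download_pdf(year)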
I have this download link that I've extracted from a site with the following code:
import os
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
url = r'https://rawkuma.com/manga/kimetsu-no-yaiba/'
req = Request(url, headers={'User-Agent':'Chrome'})
html = urlopen(req)
soup = BeautifulSoup(html, "html.parser")
body = soup.find_all('body')
content = body[0].find_all('div',{'id':'content'})
ul = content[0].find_all('ul')
chapter = ul[0].find_all('li',{'data-num':'128'})
dt = chapter[0].find_all('div',{'class':'dt'})
a = dt[0].find_all('a')
a = str(a[0])
href = a.split(" ")[2][5:]
Now I want to use that href, which is a download link to a zip file, and download it to a specified folder. I've tried something like this:
save_path = r'C:\Users\...'
file_name = r'kimetsu-no-yaiba-chapter-128'
completeName = os.path.join(save_path, file_name+".zip")
file1 = open(completeName, "w")
file1.write(href)
file1.close()
But this seems to just add an empty zip file to the folder. And if I try to open the URL first before passing it to the write function, I get an error:
req = Request(href)
r = urlopen(req)
save_path = r'C:\Users\...'
file_name = r'kimetsu-no-yaiba-chapter-128'
completeName = os.path.join(save_path, file_name+".zip")
file1 = open(completeName, "w")
file1.write(r)
file1.close()
But I get this error:
urllib.error.URLError: <urlopen error unknown url type: "https>
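The "https in that traceback suggests the string splitting left a literal quote character at the start of href. Reading the attribute through BeautifulSoup instead of slicing the string representation avoids that; a small sketch, replacing only the last two lines of the scraping code above:
# instead of:  a = str(a[0]);  href = a.split(" ")[2][5:]
href = dt[0].find_all('a')[0]['href']  # the attribute value, without surrounding quotes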
The URL http://dl.rawkuma.com/?id=86046 is not the actual URI of the zip file; there is a redirect to the real link. So here is the code to download the zip file, based on your example. You need to install the requests library, which makes this easier.
import requests
import os

URL = 'http://dl.rawkuma.com/?id=86046'
res = requests.get(URL, allow_redirects=True)
# this is the actual url for the zip file
print(res.url)
with requests.get(res.url, stream=True) as r:
    r.raise_for_status()
    print('downloading')
    with open(os.path.join('.', 'file.zip'), 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
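If you would rather keep the server's file name instead of hard-coding 'file.zip', it can usually be taken from the final URL after the redirect (a small sketch; it assumes the redirect target ends in an actual file name):
from urllib.parse import urlparse
zip_name = os.path.basename(urlparse(res.url).path) or 'file.zip'  # fall back if the path has no file name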
I'm trying to fetch the content at the URL below, but it doesn't work with the requests module, although the link opens fine in a browser. How can I get the file using the requests library?
In [2]: requests.get('http://www.dwconstir.com/inc/download.asp?FileName=3Q17%20%uC2E4%uC801PT_ENG.pdf')
Out[2]: <Response [400]>
You are trying to download a PDF. You can do that with urllib2 (Python 2).
Sample:
import urllib2
src_url = "http://www.dwconstir.com/inc/download.asp?FileName=3Q17%20%uC2E4%uC801PT_ENG.pdf"
path = "DEST_PATH/"  # folder where you want to save the file (keep the trailing separator)
response = urllib2.urlopen(src_url)
file = open(path + "document.pdf", 'wb')
file.write(response.read())
file.close()
Using requests:
import requests
url = 'http://www.dwconstir.com/inc/download.asp?FileName=3Q17%20%uC2E4%uC801PT_ENG.pdf'
path = "DEST_PATH/"  # folder where you want to save the file (keep the trailing separator)
r = requests.get(url, stream=True)
with open(path + "document.pdf", 'wb') as f:
    f.write(r.content)
I am building a crawler in Python and I have a list of hrefs from the page.
Now I have a list of file extensions to download, like:
extensions = ['zip', 'rar', 'pdf', 'mp3']
How can I save the files from those URLs to a local directory using Python?
EDIT:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.example.com/downlaod"
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
list_urls = soup.find_all('a')
print list_urls[6]
Going by your posted example, the URL you want to fetch next is presumably list_urls[6]['href'].
The first trick is that this might be a relative URL rather than absolute, so:
import os        # used below for os.path.basename
import urlparse  # Python 2; in Python 3 use urllib.parse
newurl = list_urls[6]['href']
absurl = urlparse.urljoin(site.url, newurl)
Also, you only want to fetch the file if it has the right extension, so:
extensions = ('.zip', '.rar', '.pdf', '.mp3')  # str.endswith needs a tuple, with the leading dots
if not absurl.endswith(extensions):
    return  # or break or whatever
But once you've decided what URL you want to download, it's no harder than your initial fetch:
page = urllib2.urlopen(absurl)
html = page.read()
path = urlparse.urlparse(absurl).path
name = os.path.basename(path)
with open(name, 'wb') as f:
    f.write(html)
That's mostly it.
There are a few things you might want to add, but if so, you have to add them all manually. For example (a short sketch of these follows below):
Look for a Content-Disposition header with a suggested filename to use in place of the URL's basename.
Copy from page to f with shutil.copyfileobj instead of reading the whole thing into memory and then writing it out.
Deal with existing files that have the same name.
…
But that's the basics.
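A compact sketch of those three points; the answer above is Python 2, but the same ideas in Python 3 look roughly like this (the save helper and the "_copy" suffix are illustrative choices, not part of the original answer):
import os
import shutil
from urllib.parse import urlparse
from urllib.request import urlopen

def save(absurl):
    with urlopen(absurl) as page:
        # prefer a server-suggested filename (Content-Disposition), else the URL's basename
        name = page.headers.get_filename() or os.path.basename(urlparse(absurl).path)
        # deal with an existing file of the same name instead of overwriting it
        if os.path.exists(name):
            base, ext = os.path.splitext(name)
            name = base + "_copy" + ext
        # stream the response to disk instead of reading it all into memory
        with open(name, "wb") as f:
            shutil.copyfileobj(page, f)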
You can use the Python requests library, as you asked in the question: http://www.python-requests.org
You can save a file from a URL like this:
import requests
url = 'http://i.stack.imgur.com/0LJdh.jpg'
data = requests.get(url).content
filename = "image.jpg"
with open(filename, 'wb') as f:
    f.write(data)
A solution using urllib3:
import os
import urllib3
import urllib.parse
from bs4 import BeautifulSoup

url = "https://path/site"
site = urllib3.PoolManager()
html = site.request('GET', url)
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('a')
and then a recursive function to download all the files:
extensions = ('.zip', '.rar', '.pdf', '.mp3')  # the targeted extensions; str.endswith needs a tuple

def recursive_function(list_urls):
    if not list_urls:  # base case: no links left to process
        return
    newurl = list_urls[0]['href']
    absurl = urllib.parse.urljoin(url, newurl)
    list_urls.pop(0)
    if absurl.endswith(extensions):  # verify it has one of the targeted extensions
        html = site.request('GET', absurl)
        name = os.path.basename(absurl)
        with open(name, 'wb') as f:
            f.write(html.data)
    return recursive_function(list_urls)
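Called on the list built above, this walks every anchor once and saves any file whose URL ends in one of the targeted extensions (for very long link lists an ordinary for loop would avoid Python's recursion limit):
recursive_function(list_urls)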