Download PDFs from all pages of a website using Python

I need help with some code. I want to download the PDFs from every page that has a "/#documentu" section, not just one, but I don't want to write all those links into the code. The parsing must be automatic. Here's the code:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Need to download from all pages that have files, like "/#documentu",
# but without writing all the links in the code. It must be automatic.
urlpage = "https://fasie.ru/programs/programma-innoshkolnik/#documentu"

# If there is no such folder, the script will create one automatically
folder_location = r'C:\Download'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(urlpage)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    # Name the pdf files using the last portion of each link, which is unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(urlpage, link['href'])).content)

# Rename the downloaded files to sequential numbers
path = r'C:\Download'
i = 1
for file_name in os.listdir(path):
    base_name, ext = os.path.splitext(file_name)
    abs_file_name = os.path.join(path, file_name)
    new_abs_file_name = os.path.join(path, str(i) + ext)
    os.rename(abs_file_name, new_abs_file_name)
    i += 1
Any help with making the parsing automatic would be appreciated.
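One way to make the parsing automatic is to crawl the site's internal links and download every PDF found along the way. A minimal sketch, assuming all relevant pages are reachable through links on the same domain (the start_url below is a guess and may need adjusting):
import os
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup

start_url = "https://fasie.ru/programs/"  # assumed entry point; adjust as needed
folder_location = r'C:\Download'
os.makedirs(folder_location, exist_ok=True)

domain = urlparse(start_url).netloc
seen = set()
queue = [start_url]

while queue:
    page = queue.pop(0)  # breadth-first crawl
    if page in seen:
        continue
    seen.add(page)
    try:
        response = requests.get(page, timeout=10)
    except requests.RequestException:
        continue
    soup = BeautifulSoup(response.text, "html.parser")
    for a in soup.find_all("a", href=True):
        href = urljoin(page, a["href"]).split("#")[0]  # drop fragments like #documentu
        if href.lower().endswith(".pdf"):
            # Save each PDF under its original name
            filename = os.path.join(folder_location, href.split("/")[-1])
            with open(filename, "wb") as f:
                f.write(requests.get(href).content)
        elif urlparse(href).netloc == domain and href not in seen:
            queue.append(href)  # follow internal links only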

Related

Downloading PDFs from a Website using Python

I am completing a Master's in Data Science and working on a text-mining assignment. In this project, I intend to download several PDFs from a website. In this case, I want to scrape and save the document called "Prospectus".
Below is the code I am using in Python. The prospectus I wish to download is shown in the screenshot below. However, the script downloads other documents from the web page instead. Is there something I need to change in my script?
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://www.ishares.com/us/products/239726/ishares-core-sp-500-etf"

# If there is no such folder, the script will create one automatically
folder_location = r'.\Output'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    # Name the pdf files using the last portion of each link which are unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
Try:
import re
import requests
import urllib.parse
from bs4 import BeautifulSoup

url = "https://www.ishares.com/us/products/239726/ishares-core-sp-500-etf"
html = requests.get(url).text

# The document list is loaded via AJAX; pull the endpoint out of the page source
ajax_url = (
    "https://www.ishares.com"
    + re.search(r'dataAjaxUrl = "([^"]+)"', html).group(1)
    + "?action=ajax"
)
soup = BeautifulSoup(requests.get(ajax_url).content, "html.parser")

# The "Prospectus" link carries the direct PDF path in its iframeUrlOverride parameter
prospectus_url = (
    "https://www.ishares.com"
    + soup.select_one("a:-soup-contains(Prospectus)")["href"]
)
pdf_url = (
    "https://www.ishares.com"
    + urllib.parse.parse_qs(prospectus_url)["iframeUrlOverride"][0]
)

print("Downloading", pdf_url)
with open(pdf_url.split("/")[-1], "wb") as f_out:
    f_out.write(requests.get(pdf_url).content)
Prints:
Downloading https://www.ishares.com/us/literature/prospectus/p-ishares-core-s-and-p-500-etf-3-31.pdf
and saves p-ishares-core-s-and-p-500-etf-3-31.pdf:
-rw-r--r-- 1 root root 325016 okt 17 22:31 p-ishares-core-s-and-p-500-etf-3-31.pdf
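The intermediate AJAX request is needed because the product page builds its literature list dynamically, so the Prospectus link is not present in the initial HTML. Note that parse_qs is applied to the whole URL here; that happens to work because iframeUrlOverride arrives as an ordinary &-separated query parameter, though parsing only the query string (urllib.parse.urlparse(prospectus_url).query) would be the more robust choice.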

Downloading a PDF from a URL with urllib (getting weird HTML output instead)

I am trying to download PDFs from several PDF URLs.
An example: https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf
This URL opens directly into the PDF in my browser.
However, when I use this code to download it from the link, I get an HTML file instead:
import urllib.request

link = "https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf"
urllib.request.urlretrieve(link, "/content/drive/MyDrive/Research/pdfs/1.pdf")
The saved "pdf" is actually an HTML page rather than the real document.
How do I solve this issue? Appreciate any help, thanks!
You can use BeautifulSoup or lxml to find the <iframe> and get its src, then use that URL to download the file:
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as BS

url = 'https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf'

response = urllib.request.urlopen(url)
soup = BS(response.read(), 'html.parser')

# The page embeds the real PDF in an <iframe>; its src points to the file
iframe = soup.find('iframe')
url = iframe['src']

# Decode percent-escapes and use the last path segment as the filename
filename = urllib.parse.unquote(url)
filename = filename.rsplit('/', 1)[-1]

urllib.request.urlretrieve(url, filename)
Alternatively, you can check a few files to see if they all use the same https://d2x0djib3vzbzj.cloudfront.net/ prefix and simply substitute it into the URL:
import urllib.request
import urllib.parse

url = 'https://www.fasb.org/page/showpdf?path=0001-%201700-UFI%20AICPA%20ACSEC%20Hanson.pdf'
url = url.replace('https://www.fasb.org/page/showpdf?path=',
                  'https://d2x0djib3vzbzj.cloudfront.net/')

filename = urllib.parse.unquote(url)
filename = filename.rsplit('/', 1)[-1]

urllib.request.urlretrieve(url, filename)

Web Scraping - Python - need assistance

This is my first time posting, so apologies for any errors. I currently have a file with a list of URLs, and I am trying to create a Python program that will visit each URL, grab the text from the HTML page, and save it in a .txt file. I am currently using BeautifulSoup to scrape these sites, and many of them are throwing errors which I am unsure how to solve. I am looking for a better way to do this; I have posted my code below.
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# Extracts page contents using BeautifulSoup
def page_extract(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = uReq(req, timeout=5).read()
    page_soup = soup(webpage, "lxml")
    return page_soup

# Open the file that contains the links
file1 = open('links.txt', 'r')
lines = file1.readlines()

# Iterate through the list of URLs
for i in range(0, len(lines)):
    fileName = str(i) + ".txt"
    url = str(lines[i])
    print(i)
    try:
        # If the scraping is successful, save the text contents in a text file
        # whose name is the index
        soup2 = page_extract(url)
        text = soup2.text
        f = open("Politifact Files/" + fileName, "x")
        f.write(str(text))
        f.close()
        print(url)
    except:
        # Otherwise record it in another folder which contains all the sites that threw an error
        f = open("Politifact Files Not Completed/" + fileName, "x")
        f.close()
        print("NOT DONE: " + url)
Thanks @Thierry Lathuille and @Dr Pi for your responses. I was able to solve this problem by looking into Python libraries that can scrape the important text off a webpage. I came across one called Trafilatura, which accomplishes this task. Its documentation is at https://pypi.org/project/trafilatura/.
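A minimal sketch of that approach with Trafilatura (assuming it is installed via pip install trafilatura, and reusing the links.txt layout from the original script):
import trafilatura

with open('links.txt') as f:
    urls = [line.strip() for line in f if line.strip()]

for i, url in enumerate(urls):
    downloaded = trafilatura.fetch_url(url)  # returns None if the fetch fails
    text = trafilatura.extract(downloaded) if downloaded else None
    if text:
        with open(f"Politifact Files/{i}.txt", "w", encoding="utf-8") as out:
            out.write(text)
    else:
        print("NOT DONE:", url)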

Download a PDF embedded in a webpage using Python 2.7

I want to download the PDF and store it in a folder on my local computer.
Here is the link to the PDF I want to download: https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738
I have written code with both Selenium and urllib, but both failed to download it.
import time, urllib
time.sleep(2)
pdfPath = "https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738"
pdfName = "jco.2018.77.8738.pdf"
f = open(pdfName, 'wb')
f.write(urllib.urlopen(pdfPath).read())
f.close()
It's much easier with requests:
import requests

url = 'https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738'
pdfName = "./jco.2018.77.8738.pdf"

r = requests.get(url)
with open(pdfName, 'wb') as f:
    f.write(r.content)
from pathlib import Path
import requests

filename = Path("jco.2018.77.8738.pdf")
url = "https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738"

response = requests.get(url)
filename.write_bytes(response.content)
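If the server returns an error page instead of the PDF (some publishers block non-browser clients), sending a browser-like User-Agent header sometimes helps. A minimal sketch along the same lines, not from the original answers:
import requests

url = "https://ascopubs.org/doi/pdfdirect/10.1200/JCO.2018.77.8738"
# Browser-like User-Agent; some publishers reject the default python-requests agent
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
response.raise_for_status()  # fail loudly on 403/404 instead of saving an error page

with open("jco.2018.77.8738.pdf", "wb") as f:
    f.write(response.content)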

Downloaded videos with urllib and bs4 do not play

I have an online folder with video files, and I need to download the videos from there to my local system. To do that I am using the bs4 and urllib modules, but the downloaded files are not playable.
Please have a look at my code.
from bs4 import BeautifulSoup
import urllib2
import random
import urllib
from urllib2 import urlopen

def download(url):
    response = urllib.urlopen("http://ssdolutions/addadsfasdfulsdfaatadfae")
    doc = response.read()
    soup = BeautifulSoup(doc)
    for link in soup.find_all('a'):
        x = (link.get('href'))
        name = random.randrange(1, 10)
        full_name = str(name) + ".mp4"
        urllib.urlretrieve(url, full_name)

download("http://ssdolutions/addadsfasdfulsdfaatadfae")
Pass each link's URL (stored in the x variable) to the urlretrieve call:
from bs4 import BeautifulSoup
import urllib2
import random
import urllib
import os
from urllib2 import urlopen

def download(url):
    response = urllib.urlopen("http://ssdolutions/addadsfasdfulsdfaatadfae")
    doc = response.read()
    soup = BeautifulSoup(doc)
    for link in soup.find_all('a'):
        url = (link.get('href'))
        # Extract filename from link URL
        filename = os.path.basename(url)
        file_data = os.path.splitext(filename)
        if len(file_data) > 1:
            file_ext = file_data[1]
            # This will allow you to download links with the *.mp4 extension only
            if file_ext == ".mp4":
                urllib.urlretrieve(url, filename)

download("http://ssdolutions/addadsfasdfulsdfaatadfae")
