I am trying to download all the pdf files from a webpage.
I want to use the h3 tag text as my filename. It works now. Thanks @Gauri Shankar Badola.
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
url = "https://docs.python.org/3/download.html"
#If there is no such folder, the script will create one automatically
folder_location = r'D:/Download'
if not os.path.exists(folder_location):os.mkdir(folder_location)
response = requests.get(url)
soup= BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("div", class_="presentation__content"):
    anchor_elements = link.findAll("a", class_="presentation__doc-link")
    h3_elements = link.findAll("h3", class_="presentation__title")
    if anchor_elements and h3_elements:
        pdf_url = anchor_elements[0].attrs['href']
        header_text = h3_elements[0].text.strip()
        #print(pdf_url)
        #print(header_text.replace(" ", "_"))
        filename = os.path.join(folder_location, header_text.replace(" ", "_"))
        #print(filename)
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(url, pdf_url)).content)
Instead of fetching all the anchor elements whose href ends in .pdf, fetch each div that contains both the anchor for the PDF link and the h3 for the display title.
Updated code:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
url = "http://chemlabs.princeton.edu/macmillan/presentations/"
#If there is no such folder, the script will create one automatically
folder_location = r'D:/download'
if not os.path.exists(folder_location):os.mkdir(folder_location)
response = requests.get(url)
soup= BeautifulSoup(response.text, "html.parser")
# find all divs with presentation_content class
for link in soup.find_all("div", class_="presentation__content"):
    anchor_elements = link.findAll("a", class_="presentation__doc-link")
    h3_elements = link.findAll("h3", class_="presentation__title")
    if anchor_elements and h3_elements:
        pdf_url = anchor_elements[0].attrs['href']
        header_text = h3_elements[0].text.strip()
        filename = os.path.join(folder_location, header_text)
        print(filename)
Output on Windows:
D:/download\Decarboxylative and Decarbonylative Couplings of (Hetero)Aryl Carboxylic Acids and Derivatives
D:/download\Boron Homologation
D:/download\Metal-Organic Frameworks (MOFs)
D:/download\Bioceramic Materials
D:/download\The Olifactory System
D:/download\PROteolysis Targeting Chimera (PROTAC) Targeted Intracellular Protein Degradation
D:/download\High Energy Materials
D:/download\Bioisosteres of Common Functional Groups
D:/download\Halogen Bonding
D:/download\Nonperfect Synchronization
D:/download\Total Syntheses Enabled by Cross Coupling
D:/download\Carbenes: multiplicity and reactivity
D:/download\Selective C-F bond Functionalization in Multifluoroarenes and Trifluoroarenes and Trifluoromethylarenes
D:/download\Proximity- and Affinity- Based Labeling Methods for Interactome Mapping
D:/download\Chemistry of First-Row Transition Metal Photocatalysts
D:/download\Switchable Catalysis
D:/download\Linear Free Energy Relationships
D:/download\Machine Learning
D:/download\Polyoxometalate Photocatalysis
D:/download\Cobalt in Organic Synthesis
D:/download\Metal Nanoparticles in Catalysis
D:/download\Ultrafast Spectroscopic Methods: Fundamental Principles and Applications in Photocatalysis
D:/download\Quantum Dots: Applications in Electron and Energy Transfer Processes
D:/download\PET Imaging
D:/download\Spin-Orbit Coupling and Inorganic Photocatalysts
D:/download\Recent Advances in Cross-Coupling by Manganese Catalysis
D:/download\Recent Developments in Nucleophilic Fluorination
D:/download\Advances in Cancer Immunotherapy
PS: When saving the files, replace spaces with hyphens in the filename. Also, on Windows the base location should use backslashes.
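A minimal sketch of that suggestion (the folder path and title below are placeholders taken from the output above, and appending the .pdf extension is my addition, not part of the original answer):

import os

folder_location = r'D:\download'            # backslash-style Windows path (placeholder)
header_text = "Boron Homologation"          # example title from the printed output above

# replace spaces with hyphens and add a .pdf extension before saving (assumption)
safe_name = header_text.replace(" ", "-") + ".pdf"
filename = os.path.join(folder_location, safe_name)
print(filename)  # D:\download\Boron-Homologation.pdf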
Sorry, I didn't see the problem clearly just now. I'm not familiar with BeautifulSoup, so I'll give you another solution.
import os
from simplified_scrapy import SimplifiedDoc,req,utils
url = "http://chemlabs.princeton.edu/macmillan/presentations/"
folder_location = r'D:/download'
if not os.path.exists(folder_location):os.mkdir(folder_location)
html = req.get(url)
doc = SimplifiedDoc(html)
links = doc.selects('a').contains('.pdf',attr='href')
for link in links:
    h3 = link.getNext('h3')
    filename = os.path.join(folder_location, h3.text)
    print(filename)
I am completing a Masters in Data Science and working on a Text Mining assignment. For this project I intend to download several PDFs from a website. In this case, I want to scrape and save the document called "Prospectus".
Below is the code I am using in Python. The prospectus I wish to download is shown in the screenshot below. However, the script returns different documents from the web page. Is there something I need to change in my script?
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
url = "https://www.ishares.com/us/products/239726/ishares-core-sp-500-etf"
# If there is no such folder, the script will create one automatically
folder_location = r'.\Output'
if not os.path.exists(folder_location): os.mkdir(folder_location)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    # Name the pdf files using the last portion of each link which are unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
Try:
import re
import requests
import urllib.parse
from bs4 import BeautifulSoup
url = "https://www.ishares.com/us/products/239726/ishares-core-sp-500-etf"
html = requests.get(url).text
ajax_url = (
    "https://www.ishares.com"
    + re.search(r'dataAjaxUrl = "([^"]+)"', html).group(1)
    + "?action=ajax"
)
soup = BeautifulSoup(requests.get(ajax_url).content, "html.parser")
prospectus_url = (
    "https://www.ishares.com"
    + soup.select_one("a:-soup-contains(Prospectus)")["href"]
)
pdf_url = (
    "https://www.ishares.com"
    + urllib.parse.parse_qs(prospectus_url)["iframeUrlOverride"][0]
)
print("Downloading", pdf_url)
with open(pdf_url.split("/")[-1], "wb") as f_out:
    f_out.write(requests.get(pdf_url).content)
Prints:
Downloading https://www.ishares.com/us/literature/prospectus/p-ishares-core-s-and-p-500-etf-3-31.pdf
and saves p-ishares-core-s-and-p-500-etf-3-31.pdf:
-rw-r--r-- 1 root root 325016 okt 17 22:31 p-ishares-core-s-and-p-500-etf-3-31.pdf
Currently I have a script that can only download the HTML of a given page.
Now I want to download all the files of the web page, including HTML, CSS, JS and image files (the same as we get with Ctrl+S on any website).
My current code is:
import urllib
url = "https://en.wikipedia.org/wiki/Python_%28programming_language%29"
urllib.urlretrieve(url, "t3.html")
I have looked at many questions, but they all only download the HTML.
The following implementation lets you get the sub-HTML pages. It can be developed further to fetch the other files you need. I added a depth variable so you can set the maximum number of levels of sub-pages that you want to parse.
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url
pagelist=["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=2)
print urls
Python 3 version, 2019. May this save some time for somebody:
#!/usr/bin/env python
import urllib.request as urllib2
from bs4 import *
from urllib.parse import urljoin
def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read(), "html.parser")
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url
pagelist=["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
print( urls )
You can do that easily with the simple Python library pywebcopy.
For the current version, 5.0.1:
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS and JS all in your download_folder, working just like the original site.
Using Python 3+, Requests, and other standard libraries.
The function savePage receives a requests.Response and the pagefilename where to save it.
It saves pagefilename.html in the current folder.
It downloads JavaScript, CSS and images based on the script, link and img tags and saves them in a folder named pagefilename_files.
Any exceptions are printed on sys.stderr, and the function returns a BeautifulSoup object.
The Requests session must be a global variable unless someone writes cleaner code here for us.
You can adapt it to your needs.
import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def soupfindAllnSave(pagefolder, url, soup, tag2find='img', inner='src'):
    if not os.path.exists(pagefolder):  # create only once
        os.mkdir(pagefolder)
    for res in soup.findAll(tag2find):  # images, css, etc..
        try:
            filename = os.path.basename(res[inner])
            fileurl = urljoin(url, res.get(inner))
            # rename to saved file path
            # res[inner] # may or may not exist
            filepath = os.path.join(pagefolder, filename)
            res[inner] = os.path.join(os.path.basename(pagefolder), filename)
            if not os.path.isfile(filepath):  # was not downloaded
                with open(filepath, 'wb') as file:
                    filebin = session.get(fileurl)
                    file.write(filebin.content)
        except Exception as exc:
            print(exc, file=sys.stderr)
    return soup

def savePage(response, pagefilename='page'):
    url = response.url
    soup = BeautifulSoup(response.text)
    pagefolder = pagefilename + '_files'  # page contents
    soup = soupfindAllnSave(pagefolder, url, soup, 'img', inner='src')
    soup = soupfindAllnSave(pagefolder, url, soup, 'link', inner='href')
    soup = soupfindAllnSave(pagefolder, url, soup, 'script', inner='src')
    with open(pagefilename + '.html', 'w') as file:
        file.write(soup.prettify())
    return soup
Example: saving the Google page and its contents (google_files folder):
session = requests.Session()
#... whatever requests config you need here
response = session.get('https://www.google.com')
savePage(response, 'google')
Try the Python library Scrapy. You can program Scrapy to recursively scan a website, downloading its pages, scanning them, and following links (a minimal spider sketch follows the quote below):
An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.
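As a rough illustration (not from the original answer), a minimal Scrapy spider that saves each page it visits and follows every link could look like the sketch below; the start URL is taken from the question, the filename logic is a placeholder, and the crawl depth is capped with Scrapy's DEPTH_LIMIT setting:

import scrapy

class PageSpider(scrapy.Spider):
    name = "pages"
    start_urls = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
    custom_settings = {"DEPTH_LIMIT": 2}  # stop following links after two levels

    def parse(self, response):
        # save the raw page body under a name derived from the URL
        filename = response.url.rstrip("/").split("/")[-1] or "index"
        with open(filename + ".html", "wb") as f:
            f.write(response.body)
        # follow every link on the page; Scrapy handles deduplication and depth limiting
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)

You can run it with scrapy runspider spider.py.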
I'm trying to extract the language proportions spoken at companies, using Python's BeautifulSoup.
Yet the information seems to come from a script, not from HTML, and I'm having some trouble.
For instance, from the following page, when I try
webpage ="https://www.zippia.com/amazon-com-careers-487/"
page = requests.get(webpage)
soup = BeautifulSoup(page.content, 'lxml')
for links in soup.find_all('div', {'class': 'companyEducationDegrees'}):
    raw_text = links.get_text()
    lines = raw_text.split('\n')
    print(lines)
    print('-------------------')
I don't get any result, while the ideal result should be Spanish 61.1%, French 9.7%, etc.
As you already found out, the data is put into the page via JS. However, you can still get that data, because all of the company data is always loaded with the page. You can access it via requests + BeautifulSoup + json (+ re):
import json
import re
import requests
from bs4 import BeautifulSoup
webpage = "https://www.zippia.com/amazon-com-careers-487/"
page = requests.get(webpage)
soup = BeautifulSoup(page.content, 'lxml')
for script in soup.find_all('script', {'type': 'text/javascript'}):
    if 'getCompanyInfo' in script.text:
        match = re.search("{[^\n]*}", script.text)
        data = json.loads(match.group())
        print(data["companyDiversity"]["languages"])
        json.dump(data, open("test.json", "w"), indent=2)  # Only if you want the data put in a readable format to a file (like if you want to find the path to an entry)
I went to the NYC MTA website to download some turnstile data and came up with a script to download only the 2017 data using Python.
Here is the script:
import urllib
import re
html = urllib.urlopen('http://web.mta.info/developers/turnstile.html').read()
links = re.findall('href="(data/\S*17[01]\S*[a-z])"', html)
for link in links:
    txting = urllib.urlopen('http://web.mta.info/developers/' + link).read()
    lin = link[20:40]
    fhand = open(lin, 'w')
    fhand.write(txting)
    fhand.close()
Is there a simpler way to write this script?
As suggested by @dizzyf, you can use BeautifulSoup to get the href values from the web page.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html)
links = [link.get('href') for link in soup.find_all('a')
         if 'turnstile_17' in link.get('href', '')]
If you don't have to get the files in Python (and you're on a system with the wget command), you can write the links to a file:
with open('url_list.txt', 'w') as url_file:
    for url in links:
        url_file.write(url + '\n')
Then download them with wget:
$ wget -i url_list.txt
wget -i downloads all the URLs from the file into the current directory, preserving the filenames.
The code below should do what you need.
import requests
import bs4
import time
import random
import re
pattern = '2017'
url_base = 'http://web.mta.info/developers/'
url_home = url_base + 'turnstile.html'
response = requests.get(url_home)
data = dict()
soup = bs4.BeautifulSoup(response.text)
links = [link.get('href') for link in soup.find_all('a',
                                                     text=re.compile('2017'))]
for link in links:
    url = url_base + link
    print "Pulling data from:", url
    response = requests.get(url)
    data[link] = response.text  # I don't know what you want to do with the data so here I just store it to a dict, but you could store it to a file as you did in your example.
    not_a_robot = random.randint(2, 15)
    print "Waiting %d seconds before next query." % not_a_robot
    time.sleep(not_a_robot)  # some APIs will throttle you if you hit them too quickly
I am building a crawler in Python and I have a list of hrefs from the page.
Now I have a list of file extensions to download, like
list = ['zip','rar','pdf','mp3']
How can I save the files from those URLs to a local directory using Python?
EDIT:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.example.com/downlaod"
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
list_urls = soup.find_all('a')
print list_urls[6]
Going by your posted example:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.example.com/downlaod"
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
list_urls = soup.find_all('a')
print list_urls[6]
So, the URL you want to fetch next is presumably list_urls[6]['href'].
The first trick is that this might be a relative URL rather than absolute. So:
import os, urlparse  # needed for the snippets below (Python 2; in Python 3 use urllib.parse)

newurl = list_urls[6]['href']
absurl = urlparse.urljoin(site.url, newurl)
Also, you only want to fetch the file if it has the right extension. Note that str.endswith accepts a tuple of suffixes, so extensions should be something like ('.zip', '.rar', '.pdf', '.mp3'):
if not absurl.endswith(extensions):
    return  # or break or whatever
But once you've decided what URL you want to download, it's no harder than your initial fetch:
page = urllib2.urlopen(absurl)
html = page.read()
path = urlparse.urlparse(absurl).path
name = os.path.basename(path)
with open(name, 'wb') as f:
    f.write(html)
That's mostly it.
There are a few things you might want to add, but if so, you have to add them all manually. For example:
Look for a Content-disposition header with a suggested filename to use in place of the URL's basename.
Copy from page to f in chunks (e.g. with shutil.copyfileobj) instead of reading the whole thing into memory and then writing it out.
Deal with existing files with the same name.
…
But that's the basics; a rough sketch of the first two points follows.
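This is only an illustrative sketch, not part of the original answer, and it uses Python 3's urllib.request rather than the urllib2 shown above. The filename suggested by the Content-Disposition header is read via the response's headers (an email.message.Message), and shutil.copyfileobj streams the body to disk:

import os
import shutil
import urllib.parse
import urllib.request

def download(absurl):
    resp = urllib.request.urlopen(absurl)
    # prefer the filename suggested by the Content-Disposition header, if present
    name = resp.headers.get_filename()
    if not name:
        name = os.path.basename(urllib.parse.urlparse(absurl).path)
    # stream the response to disk instead of reading it all into memory first
    with open(name, 'wb') as f:
        shutil.copyfileobj(resp, f)
    return name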
You can use the Python requests library, as you asked in the question: http://www.python-requests.org
You can save a file from a URL like this:
import requests
url='http://i.stack.imgur.com/0LJdh.jpg'
data=requests.get(url).content
filename="image.jpg"
with open(filename, 'wb') as f:
    f.write(data)
Solution using urllib3:
import os
import urllib3
from bs4 import BeautifulSoup
import urllib.parse
url = "https://path/site"
site = urllib3.PoolManager()
html = site.request('GET', url)
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('a')
And then a recursive function to get all the files:
def recursive_function(list_urls):
    if not list_urls:  # base case: nothing left to process
        return
    newurl = list_urls[0]['href']
    absurl = url + newurl
    list_urls.pop(0)
    # extensions should be a tuple of suffixes, e.g. ('.zip', '.rar', '.pdf', '.mp3')
    if absurl.endswith(extensions):  # verify it has one of the targeted extensions
        html = site.request('GET', absurl)  # reuse the PoolManager created above
        name = os.path.basename(absurl)
        with open(name, 'wb') as f:
            f.write(html.data)
    return recursive_function(list_urls)