How to scrape embedded PDFs with BeautifulSoup - Python

I am trying to scrape this page recursively using BeautifulSoup.
The problem, however, is that the PDF links actually open a new page on which the PDFs are embedded. On that embedded page we can subsequently find the true PDF link in the embed tag.
I therefore added a line to check whether the content type is application/pdf. However, using the redirect URL, I am unable to extract the PDF links from this new page with the embedded PDF.
I tried the following, but it did not work (a valid PDF link is never found):
# run the following in a .py file:
# spider = fdb.OurSpider()
# spider.scrape_page(url=url)

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests import get
import time

MAX_DEPTH = 10


class OurSpider:

    def __init__(self):
        """Init our Custom Spider"""

    def scrape_page(self, url):
        """Scrape page"""
        try:
            self.download_pdfs(url=url)
        except requests.exceptions.MissingSchema:
            print(f'skipped MissingSchema [{url}]')
        try:
            links = self.get_links(url=url)
            print(links)
        except:
            print('')

    def download_pdfs(self, url, depth=1):
        # If there is no such folder, the script will create one automatically
        print('')
        print(f'--- [{depth}] {url}')
        if depth > MAX_DEPTH:
            return 'max depth reached'

        soup = self.get_soup(url=url)
        links = soup.select("a[href$='.pdf']")

        for link in links:
            try:
                full_url = urljoin(url, link['href'])
                content = get(full_url)
                if content.status_code == 200 and content.headers['content-type'] == 'application/pdf':
                    self.download_pdf(full_url=full_url)
                elif full_url != url:
                    self.download_pdfs(url=full_url, depth=depth + 1)
                else:
                    print('skipping url')
            except requests.exceptions.InvalidSchema:
                print(f'skipped InvalidSchema [{link}]')

        print('--- downloading pdfs done')

    def download_pdf(self, full_url):
        """Download single url"""
        filename = "".join(['tmp/', str(round(time.time() * 1000)), '.pdf'])
        if not self.file_exists(filename=filename):
            print(f'{filename}: {full_url}')
            with open(filename, 'wb') as f:
                f.write(requests.get(full_url).content)

    def get_links(self, url):
        """Get the links given the url"""
        soup = self.get_soup(url=url)
        return soup.findAll('a', href=True)

    @staticmethod
    def file_exists(filename):
        """File exists locally"""
        return os.path.exists(filename)

    @staticmethod
    def get_soup(url):
        """Init the url"""
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        return soup
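One way to handle the intermediate viewer pages described above is to look for the embed/iframe/object tag on the redirect page and take its src as the real PDF URL. The following is only a hedged sketch (the exact tag and attribute names depend on the masked site, so the selector is an assumption), which could be called from download_pdfs whenever a followed link returns text/html instead of application/pdf:

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def find_embedded_pdf(url):
    """Return the URL of a PDF embedded in a viewer page, or None.

    Assumption: the viewer exposes the document via an <embed>, <iframe>
    or <object> tag; the real site may differ.
    """
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    for tag in soup.select("embed[src], iframe[src], object[data]"):
        src = tag.get("src") or tag.get("data")
        candidate = urljoin(url, src)
        # confirm the candidate really is a PDF before downloading it
        head = requests.head(candidate, allow_redirects=True)
        if head.headers.get("content-type", "").startswith("application/pdf"):
            return candidate
    return None

The answer below takes a different route and pulls the real file URLs out of the viewer pages with a regular expression on the Override= parameter.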

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from urllib.parse import unquote

site = "https://www.masked.com/us/individual/resources/regulatory-documents/mutual-funds"


def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    target = [f"{url[:25]}{item.get('href')}"
              for item in soup.findAll("a", title="Annual Report")]
    return target


def parse(url):
    with requests.Session() as req:
        r = req.get(url)
        match = [unquote(f"{r.url[:25]}{match.group(1)}") for match in re.finditer(
            r"Override=(.+?)\"", r.text)]
        return match


with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(parse, url) for url in main(site)]

links = []
for future in futures:
    links.extend(future.result())

print(f"Collected {len(links)}")


def download(url):
    with requests.Session() as req:
        r = req.get(url)
        if r.status_code == 200 and r.headers['Content-Type'] == "application/pdf;charset=UTF-8":
            name = r.url.rfind("/") + 1
            name = r.url[name:]
            with open(f"{name}", 'wb') as f:
                f.write(r.content)
            return f"Saving {name}"
        else:
            pass


with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(download, url) for url in links]
    for future in as_completed(futures):
        print(future.result())

Related

Unable to scrape emails from some websites, possibly because r.html.render() is not working properly

I have some website links as samples for extracting any email addresses available on their internal pages.
However, even though I am trying to render JS-driven websites via r.html.render() within the scrape_email(url) method, some of the websites like arken.trygge.dk, gronnebakken.dk, dagtilbud.ballerup.dk/boernehuset-bispevangen etc. do not return any email, which might be due to a rendering issue.
I have attached the sample file for convenience of running.
I don't want to use Selenium, as there can be thousands or millions of webpages I want to extract emails from.
So far this is my code:
import os
import time
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession
import pandas as pd
from gtts import gTTS
import winsound

# For convenience of seeing console output in the script
pd.options.display.max_colwidth = 180

# Get the start time of script execution
startTime = time.time()

# Paste file name inside ''
input_file_name = 'sample'

input_df = pd.read_excel(input_file_name + '.xlsx', engine='openpyxl')
input_df = input_df.dropna(how='all')

internal_urls = set()
emails = set()
total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_internal_links(url):
    """
    Returns all URLs that are found on `url` and belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    print("Domain name -- ", domain_name)
    try:
        soup = BeautifulSoup(requests.get(url, timeout=5).content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            # join the URL if it's relative (not absolute link)
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not is_valid(href):
                # not a valid URL
                continue
            if href in internal_urls:
                # already in the set
                continue
            if parsed_href.netloc != domain_name:
                # if the link is not of the same domain, skip it
                continue
            if parsed_href.path.endswith((".csv", ".xlsx", ".txt", ".pdf", ".mp3", ".png", ".jpg", ".jpeg", ".svg", ".mov", ".js", ".gif", ".mp4", ".avi", ".flv", ".wav")):
                # skip site images, PDFs and other files rather than webpages
                continue
            print(f"Internal link: {href}")
            urls.add(href)
            internal_urls.add(href)
        return urls
    except requests.exceptions.Timeout as err:
        print("The website is not loading within 5 seconds... Continuing with the next one")
        pass
    except:
        print("The website is unavailable. Continuing with the next one")
        pass


def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"Crawling: {url}")
    links = get_internal_links(url)
    # for link in links:
    #     if total_urls_visited > max_urls:
    #         break
    #     crawl(link, max_urls=max_urls)


def scrape_email(url):
    EMAIL_REGEX = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    # EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
    try:
        # initiate an HTTP session
        session = HTMLSession()
        # get the HTTP Response
        r = session.get(url, timeout=10)
        # for JavaScript-driven websites
        r.html.render()
        single_url_email = []
        for re_match in re.finditer(EMAIL_REGEX, r.html.raw_html.decode()):
            single_url_email.append(re_match.group().lower())
        r.session.close()
        return set(single_url_email)
    except:
        pass


def crawl_website_scrape_email(url, max_internal_url_no=20):
    crawl(url, max_urls=max_internal_url_no)
    each_url_emails = []
    global internal_urls
    global emails
    for each_url in internal_urls:
        each_url_emails.append(scrape_email(each_url))
    URL_WITH_EMAILS = {'main_url': url, 'emails': each_url_emails}
    emails = {}
    internal_urls = set()
    return URL_WITH_EMAILS


def list_check(emails_list, email_match):
    match_indexes = [i for i, s in enumerate(emails_list) if email_match in s]
    return [emails_list[index] for index in match_indexes]


URL_WITH_EMAILS_LIST = [crawl_website_scrape_email(x) for x in input_df['Website'].values]
URL_WITH_EMAILS_DF = pd.DataFrame(data=URL_WITH_EMAILS_LIST)
URL_WITH_EMAILS_DF.to_excel(f"{input_file_name}_email-output.xlsx", index=False)
How can I solve the issue of not being able to scrape emails from the above-mentioned and similar types of websites?
Is there also any way to detect and print a message if my GET request is refused by a bot detector or related protections?
Also, how can I make this code more robust?
Thank you in advance

I'm not able to split my code into functions

I wrote some code to download PDFs from a website, and it works perfectly, downloading all of the PDFs (first code below). However, when I split my code into functions, only two links are inserted into the "papers" list and the execution ends with code zero, but the following warning message appears:
GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
The code that caused this warning is on line 11 of the file C:\Downloads\EditoraCL\download_pdf.py. To get rid of this warning, pass the additional argument 'features="html.parser"' to the BeautifulSoup constructor.
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
FIRST CODE:
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer

papers = []
pdfs = []

http = httplib2.Http()
status, response = http.request('https://www.snh2021.anpuh.org/site/anais')

for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        papers.append(link['href'])
print(papers)

for x in papers:
    if x.endswith('pdf'):
        pdfs.append(x)
print(pdfs)


def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        print('Download concluído. Salvo em {}'.format(endereco))
    else:
        resposta.raise_for_status()


if __name__ == '__main__':
    url_basica = 'https://www.snh2021.anpuh.org/{}'
    output = 'Download'
    for i in range(1, len(pdfs)):
        nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
        a = pdfs[i]
        z = url_basica.format(a)
        y = requests.get(z)
        if y.status_code != 404:
            baixa_arquivo(z, nome_do_arquivo)
CODE DIVIDED INTO FUNCTIONS:
import requests
import httplib2
import os
from bs4 import BeautifulSoup, SoupStrainer

papers = []
pdfs = []


def busca_links():
    http = httplib2.Http()
    status, response = http.request('https://www.snh2021.anpuh.org/site/anais')
    for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            papers.append(link['href'])
    return papers


def links_pdf():
    for x in papers:
        if x.endswith('pdf'):
            pdfs.append(x)
    return pdfs


def baixa_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
            novo_arquivo.write(resposta.content)
        return f'Download concluído. Salvo em {endereco}'
    else:
        resposta.raise_for_status()


if __name__ == '__main__':
    busca_links()
    links_pdf()
    url_basica = 'https://www.snh2021.anpuh.org/{}'
    output = 'Download'
    print(papers)
    print(pdfs)
    for i in range(1, len(pdfs)):
        nome_do_arquivo = os.path.join(output, 'artigo{}.pdf'.format(i))
        a = pdfs[i]
        z = url_basica.format(a)
        y = requests.get(z)
        if y.status_code != 404:
            baixa_arquivo(z, nome_do_arquivo)
Could someone help me understand why the second code is giving this error?
Functions do not share their local variables, so in order to make your code work, you should assign each function's return value back to a variable after returning it inside the function (papers = busca_links() and pdfs = links_pdf(papers)).
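A minimal sketch of that suggestion (hedged: it refactors busca_links and links_pdf to take parameters and return values instead of relying on the module-level lists; the URL is the one from the question):

import httplib2
from bs4 import BeautifulSoup, SoupStrainer


def busca_links(url):
    # build the list locally and return it instead of appending to a global
    http = httplib2.Http()
    status, response = http.request(url)
    return [link['href']
            for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a'))
            if link.has_attr('href')]


def links_pdf(papers):
    # receive the links explicitly instead of reading a global
    return [x for x in papers if x.endswith('pdf')]


if __name__ == '__main__':
    papers = busca_links('https://www.snh2021.anpuh.org/site/anais')
    pdfs = links_pdf(papers)
    print(papers)
    print(pdfs)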
Anyway, for the purpose of organization and clearer code, you should use classes and methods:
import os
import requests
import httplib2
from bs4 import BeautifulSoup, SoupStrainer


class Pdf:

    def __init__(self, base_url, url):
        self.main_dir = os.path.dirname(__file__)
        self.pdfs_dir = os.path.join(self.main_dir, 'pdfs')
        self.base_url = base_url
        self.url = url

    def get_links(self):
        http = httplib2.Http()
        status, response = http.request(self.url)
        self.links = []
        for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                if link['href'].endswith('pdf'):
                    self.links.append(f"{self.base_url}{link['href']}")

    def download_pdf(self):
        for link in self.links:
            response = requests.get(link, stream=True)
            if response.status_code == 200:
                file_path = os.path.join(self.pdfs_dir, link.split('/')[-1])
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                print('Success. Saved on {}'.format(file_path))
            else:
                # Should handle errors here, by appending them to a list and
                # trying again later.
                print('Error.')


if __name__ == '__main__':
    base_url = 'https://www.snh2021.anpuh.org/'
    url = f'{base_url}site/anais'
    pdf = Pdf(base_url, url)
    pdf.get_links()
    pdf.download_pdf()
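Separately, the GuessedAtParserWarning quoted in the question is unrelated to the missing links; as the warning text itself says, it goes away once the parser is named explicitly. A minimal sketch of that change, using the response variable from the question's busca_links:

from bs4 import BeautifulSoup, SoupStrainer

# naming the parser explicitly silences GuessedAtParserWarning
for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        papers.append(link['href'])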

Unable to return all the results at once

I've written a script in Python to fetch some links from a webpage. There are two functions within my script. The first function collects links to the local businesses from a webpage and the second function traverses those links and collects URLs to the various events.
When I try the script found here, I get the desired results.
How can I return all the results while complying with the design below?
The following script returns the results of individual links, whereas I wish to return all the results at once while keeping the design as it is (the logic may vary).
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

linklist = []


def collect_links(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    items = [urljoin(url, item.get("href")) for item in soup.select(".business-listings-category-list .field-content a[hreflang]")]
    return items


def fetch_info(ilink):
    res = requests.get(ilink)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".business-teaser-title a[title]"):
        linklist.append(urljoin(url, item.get("href")))
    return linklist


if __name__ == '__main__':
    url = "https://www.parentmap.com/atlas"
    for itemlink in collect_links(url):
        print(fetch_info(itemlink))
First of all, I removed the global linklist, as it is returned from the function anyway and keeping it global creates overlapping results. Next I added a function to "assemble" the links the way you wanted. I used a set to prevent duplicate links.
#!/usr/bin/python
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def collect_links(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    items = [urljoin(url, item.get("href")) for item in soup.select(".business-listings-category-list .field-content a[hreflang]")]
    return items


def fetch_info(ilink):
    linklist = []
    res = requests.get(ilink)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".business-teaser-title a[title]"):
        linklist.append(urljoin(url, item.get("href")))
    return linklist


def fetch_all_links(url):
    links = set()
    for itemlink in collect_links(url):
        links.update(fetch_info(itemlink))
    return list(links)


if __name__ == '__main__':
    url = "https://www.parentmap.com/atlas"
    print(fetch_all_links(url))
The main reason you are getting results one after another is that you are calling fetch_info in a loop, which calls the function again and again and prints one result at a time, rather than looping inside the fetch_info function. Try the code below:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

linklist = []


def collect_links(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    items = [urljoin(url, item.get("href")) for item in soup.select(".business-listings-category-list .field-content a[hreflang]")]
    return items


def fetch_info(url):
    for itemlink in collect_links(url):
        res = requests.get(itemlink)
        soup = BeautifulSoup(res.text, "lxml")
        for item in soup.select(".business-teaser-title a[title]"):
            linklist.append(urljoin(url, item.get("href")))
    return linklist


if __name__ == '__main__':
    url = "https://www.parentmap.com/atlas"
    print(fetch_info(url))

I am using BeautifulSoup, how can I get the link after the redirect?

I want to get the link that the download link on the article page redirects to.
For example:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
In the above article page, there are the following download links:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=
If you open this link directly, it will not redirect to the real download link; you need to open it from the article page.
# coding=utf-8
import lxml
import re
import requests
import sys
from bs4 import BeautifulSoup
from urllib.request import urlopen


def urlopen(url):
    '''
    using requests to replace urllib.request.urlopen
    return an html
    '''
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    return r.text


def generate_pages(subTitle, fromPage, toPage):
    '''
    return page sites' url list
    '''
    pages = []
    if(fromPage > 0 and fromPage < toPage):
        for i in range(fromPage, toPage + 1):
            pages.append('https://scanlibs.com/category/books' + subTitle + '/page/' + str(i))
    return pages


def get_book_sites_of_one_page(page):
    '''
    get book site's url in one page
    input: page site url
    output: book site urls list
    return book sites in one page
    '''
    html = urlopen(page)
    soup = BeautifulSoup(html, 'html.parser')
    linkList = soup.find('main').findAll('a', {'rel': 'bookmark'})
    bookSites = []
    for link in linkList[::2]:
        if 'href' in link.attrs:
            # print(link)
            bookSites.append(link.attrs['href'])
    return bookSites


def get_book_urls(bookSite):
    '''
    input a book site
    find book downloading urls in this book site
    then
    return them as a list
    '''
    bookURLs = []
    html = urlopen(bookSite)
    soup = BeautifulSoup(html, 'lxml')
    linkList = soup.findAll("a", {"target": "_blank"})
    for link in linkList[::2]:
        # print(link)
        if 'href' in link.attrs:
            bookURLs.append(link.attrs['href'])
    return bookURLs


def get_all_book_urls(fromPage=1, toPage=1, subTitle=''):
    bookSites = []
    bookURLs = []
    pages = generate_pages(subTitle, fromPage, toPage)
    for page in pages:
        bookSiteOfOnePage = get_book_sites_of_one_page(page)
        bookSites.extend(bookSiteOfOnePage)
    for bookSite in bookSites:
        book_urls = get_book_urls(bookSite)
        bookURLs += book_urls
    for bookURL in bookURLs:
        print(bookURL)
    # with open(filename, 'w') as f:
    #     f.write(bookURLs)


def main():
    if(len(sys.argv) == 4):
        '''
        python getUrl.py 1 100 programming
        from page 1 to page 100 in subject programming
        '''
        subTitle = str(sys.argv[3])
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        get_all_book_urls(fromPage, toPage, subTitle)
    if(len(sys.argv) == 3):
        '''
        python getUrl.py 1 100
        from page 1 to page 100
        '''
        subTitle = ''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        # filename = subTitle="-"+str(pageNum)+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    elif(len(sys.argv) == 2):
        '''
        python getUrl.py 10
        from page 10 to page 10
        only download books on page 10
        '''
        fromPage = int(sys.argv[1])
        toPage = fromPage + 1
        subTitle = ''
        # filename = "All-"+str(pageNum)+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    elif(len(sys.argv) == 1):
        fromPage = 1
        # custom page range
        toPage = 2
        subTitle = ''
        # filename = "All-"+"1"+"-"+time.strftime('%Y-%m-%d', time.localtime())+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    else:
        print("Error, too many arguments")


if __name__ == '__main__':
    # filename = ''
    main()
Thank you for your help!
This website checks whether the referer is set when redirecting. You can simply send the original URL as the referer header and easily bypass this. You can also see that the referer is used as a URL parameter in the final download link.
import requests
from bs4 import BeautifulSoup

s = requests.Session()
url = 'https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
r = html = s.get(url).text
soup = BeautifulSoup(html, 'html.parser')
relative_link = soup.find('a', {'id': 'download'})['href']  # get the relative link
download_redirect_link = url + relative_link
headers = {
    "referer": url
}
r2 = requests.get(download_redirect_link, headers=headers)
print(r2.url)
Output
https://rapidgator.net/file/80e881f7631eddb49de31e5718eb96ba?referer=https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/

BeautifulSoup and requests.post

I am practicing scraping on one site and ran into a mysterious situation.
import requests
from bs4 import BeautifulSoup
import json


class n_auction(object):
    def __init__(self):
        self.search_request = {
            'lawsup': 0,
            'lesson': 0,
            'next_biddate1': '',
            'next_biddate2': '',
            'state': 91,
            'b_count1': 0,
            'b_count2': 0,
            'b_area1': '',
            'b_area2': '',
            'special': 0,
            'e_area1': '',
            'e_area2': '',
            'si': 11,
            'gu': 0,
            'dong': 0,
            'apt_no': 0,
            'order': '',
            'start': 60,
            'total_record_val': 850,
            'detail_search': '',
            'detail_class': '',
            'recieveCode': '',
        }
        self.headers = {'User-Agent': 'Mozilla/5.0',
                        'Referer': 'http://goodauction.land.naver.com/auction/ca_list.php'}

    def scrape(self, max_pages):
        addr = []
        pageno = 0
        self.search_request['start'] = pageno
        while pageno < max_pages:
            payload = json.dumps(self.search_request)
            r = requests.post('http://goodauction.land.naver.com/auction/ax_list.php',
                              data=payload, headers=self.headers)
            print(r.text)
            s = BeautifulSoup(r.text)
            print(s)


if __name__ == '__main__':
    scraper = n_auction()
    scraper.scrape(30)
When I print(r.text), I get the full response text.
But after passing it through BeautifulSoup, some values are lost from the parsed output.
It's very confusing. Help me~~
Switching the parser from the default, lxml, to html.parser worked for me.
Try: s = BeautifulSoup(r.text, 'html.parser')
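A quick way to confirm the difference inside scrape (a hedged sketch that simply parses the same r.text with both parsers and compares the output sizes):

from bs4 import BeautifulSoup

s_lxml = BeautifulSoup(r.text, 'lxml')          # the parser picked by default when lxml is installed
s_html = BeautifulSoup(r.text, 'html.parser')   # the lenient built-in parser
# if html.parser keeps the values that lxml drops, the second number will be larger
print(len(str(s_lxml)), len(str(s_html)))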
