I am trying to parse a dynamic JavaScript page with PyQt5. After execution I get the following error: Process finished with exit code -1073741819 (0xC0000005). The code is here:
import sys
import requests
from bs4 import BeautifulSoup
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
import pandas as pd
class Page(QWebEnginePage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()
url = 'https://www.racingpost.com'
page = Page(url)
soup = BeautifulSoup(page.html, 'html.parser')
courses = soup.find_all('a', {'class': 'hidden-sm-down rh-cardsMatrix__courseTitle ui-link'})
for course in courses:
    CurrentCourse = course.text.strip()
    Courses_URL = url + course.get('href')
    response1 = requests.get(Courses_URL)
    soup1 = BeautifulSoup(response1.text, 'html.parser')
    horses = soup1.find_all('a', {'class': 'RC-runnerName'})
    for horse in horses:
        horses_url = url + horse.get('href')
        page1 = Page(horses_url)
        soup2 = BeautifulSoup(page1.html, 'html.parser')
        data = soup2.find_all('a', {'class': 'ui-link ui-link_table js-popupLink'})
Could you please help me rework or tweak this code so that I can get the information I want?
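A likely culprit for the 0xC0000005 access violation is that Page constructs a new QApplication (and spins up a new event loop) on every instantiation, while the inner loop above creates a Page per horse; Qt supports at most one QApplication per process. A rough sketch of one way to restructure this, with a single application and one reusable page (the Renderer class and its render method here are illustrative, not the asker's API):

import sys
from PyQt5.QtCore import QUrl, QEventLoop
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage

class Renderer(QWebEnginePage):
    """One reusable page; render(url) blocks until the HTML is ready."""

    def render(self, url):
        html = []
        loop = QEventLoop()

        def store_html(html_str):
            html.append(html_str)
            loop.quit()

        # once the page has loaded, request its HTML (toHtml is asynchronous)
        self.loadFinished.connect(lambda ok: self.toHtml(store_html))
        self.load(QUrl(url))
        loop.exec_()
        self.loadFinished.disconnect()  # avoid stacking slots on reuse
        return html[0]

app = QApplication(sys.argv)  # created exactly once per process
renderer = Renderer()
# html = renderer.render('https://www.racingpost.com')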
Related
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re

class Scraper:
    def __init__(self):
        self.driver = webdriver.Chrome(r'C:\Users\gkhat\Downloads\chromedriver.exe')
        self.url = "http://www.carwale.com/"
        self.href = []

    def load_url(self):
        self.driver.get(self.url + 'used/cars-for-sale/#sc=-1&so=-1&pn=1')

    def scroll_down(self):
        self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')

    def read_data(self):
        main = self.driver.find_element_by_xpath('/html/body/div[12]/form/section[2]/div[1]/div[4]/div[1]/div[3]/div[2]')
        soup = bs(main.get_attribute("innerHTML"), "html.parser")
        print(soup)
        for elem in soup.findAll('h2', {'class': 'card-detail-block__title'}):
            print(elem.a['href'])
            self.href.append(str(elem.a['href']))
This code:

from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re

driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
url = "http://www.carwale.com/used/cars-for-sale/#sc=-1&so=-1&pn=1"
driver.get(url)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
main = driver.find_element_by_xpath('/html/body/div[12]/form/section[2]/div[1]/div[4]/div[1]/div[3]/div[2]')
soup = bs(main.get_attribute("innerHTML"), "html.parser")
for elem in soup.findAll('h2', {'class': 'card-detail-block__title'}):
    print(elem.a['href'])
gives this
/used/cars-in-delhi/marutisuzuki-ciaz-2017-2018-d2251253/?slot=1&rk=1&isP=true
/used/cars-in-bangalore/hyundai-i10-2010-2017-d2295085/?slot=2&rk=2&isP=true
/used/cars-in-faridabad/marutisuzuki-alto-k10-d2332673/?slot=3&rk=3&isP=true
/used/cars-in-mumbai/renault-kwid-2015-2019-d2333781/?slot=4&rk=4&isP=true
/used/cars-in-mumbai/hyundai-i10-2010-2017-d2308033/?slot=5&rk=5&isP=true
/used/cars-in-mumbai/toyota-innova-crysta-2016-2020-d2327063/?slot=6&rk=6&isP=true
/used/cars-in-kolkata/landrover-freelander-2-2009-2011-d2321851/?slot=7&rk=7&isP=true
/used/cars-in-pune/hyundai-eon-2011-2019-d2295279/?slot=8&rk=8&isP=true
/used/cars-in-navi-mumbai/renault-kwid-2015-2019-d2303909/?slot=0&rk=9&isP=false
/used/cars-in-navi-mumbai/marutisuzuki-vitara-brezza-2016-2020-d2303835/?slot=0&rk=10&isP=false
/used/cars-in-navi-mumbai/marutisuzuki-swift-2014-2018-d2332235/?slot=0&rk=11&isP=false
/used/cars-in-navi-mumbai/volkswagen-vento-2015-2019-d2285395/?slot=0&rk=12&isP=false
/used/cars-in-navi-mumbai/jeep-compass-2017-2021-d2290491/?slot=0&rk=13&isP=false
/used/cars-in-navi-mumbai/honda-wr-v-2017-2020-d2332263/?slot=0&rk=14&isP=false
/used/cars-in-navi-mumbai/renault-kwid-2015-2019-d2332249/?slot=0&rk=15&isP=false
/used/cars-in-navi-mumbai/volkswagen-polo-2012-2014-d2310489/?slot=0&rk=16&isP=false
/used/cars-in-navi-mumbai/marutisuzuki-a-star-2008-2012-d2324317/?slot=0&rk=17&isP=false
/used/cars-in-navi-mumbai/hyundai-elite-i20-2016-2017-d2281903/?slot=0&rk=18&isP=false
/used/cars-in-navi-mumbai/hyundai-i20-active-2015-2018-d2316481/?slot=0&rk=19&isP=false
/used/cars-in-navi-mumbai/toyota-innova-crysta-2016-2020-d2324333/?slot=0&rk=20&isP=false
/used/cars-in-delhi/ford-figo-2015-2019-d2241843/?slot=0&rk=21&isP=false
/used/cars-in-vadodara/bmw-x3-2011-2014-d2198006/?slot=0&rk=22&isP=false
/used/cars-in-vadodara/volkswagen-jetta20082011-d2197994/?slot=0&rk=23&isP=false
/used/cars-in-vadodara/hyundai-santafe20112014-d2227084/?slot=0&rk=24&isP=false
/used/cars-in-vadodara/skoda-octavia-2013-2015-d2227082/?slot=0&rk=25&isP=false
/used/cars-in-vadodara/bmw-5-series-2007-2010-d2227083/?slot=0&rk=26&isP=false
/used/cars-in-mumbai/honda-cr-v-2013-2018-d2309393/?slot=0&rk=27&isP=false
/used/cars-in-mumbai/bmw-3-series-2007-2009-d2309385/?slot=0&rk=28&isP=false
/used/cars-in-mumbai/skoda-rapid-2014-2015-d2306945/?slot=0&rk=29&isP=false
/used/cars-in-mumbai/audi-a6-2015-2019-d2309389/?slot=0&rk=30&isP=false
/used/cars-in-mumbai/mercedesbenz-e-class-2013-2015-d2321627/?slot=0&rk=31&isP=false
/used/cars-in-delhi/volkswagen-vento-2010-2012-d2287775/?slot=0&rk=32&isP=false
I just removed the print(soup).
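(A side note for anyone running this today: Selenium 4 removed the find_element_by_* helpers and the positional executable-path argument, so the equivalent calls look roughly like this:)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service('C:/chromedriver.exe'))
# after driver.get(...) and scrolling, as above:
main = driver.find_element(By.XPATH, '/html/body/div[12]/form/section[2]/div[1]/div[4]/div[1]/div[3]/div[2]')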
After being able to log in to a protected website, I'd like to scrape some of the content of the same webpage, which is loaded dynamically. This code block handles the authentication properly, but if I try to access a <pre> element with class name lang-py, I get None returned as the output.
import sys
from PyQt5.QtCore import QByteArray, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineCore import QWebEngineHttpRequest
from PyQt5.QtWebEngineWidgets import QWebEnginePage

class Render(QWebEnginePage):
    def __init__(self, url):
        app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self._html = ""
        username = "username"
        password = "password"
        base64string = QByteArray(("%s:%s" % (username, password)).encode()).toBase64()
        request = QWebEngineHttpRequest(QUrl.fromUserInput(url))
        request.setHeader(b"Authorization", b"Basic %s" % (base64string,))
        self.load(request)
        app.exec_()

    @property
    def html(self):
        return self._html

    def _loadFinished(self):
        self.toHtml(self.handle_to_html)

    def handle_to_html(self, html):
        self._html = html
        QApplication.quit()

def main():
    url = "https://stackoverflow.com/questions/64055445/scraping-websites-with-protected-content-using-pyqt5/64055601?noredirect=1#comment113272437_64055601"
    r = Render(url)
    print(r.html)

if __name__ == "__main__":
    main()
How can I load the content of the <pre> element?
The element with tag "pre" and class "lang-py" is present in the HTML, so you can use BeautifulSoup to get the data:
# ...
from bs4 import BeautifulSoup
# ...

def main():
    url = "https://stackoverflow.com/questions/64055445/scraping-websites-with-protected-content-using-pyqt5/64055601?noredirect=1#comment113272437_64055601"
    r = Render(url)
    soup = BeautifulSoup(r.html, "html.parser")
    for tag in soup.find_all("pre", {"class": "lang-py"}):
        print("=" * 50)
        print(tag.text)
I am trying to scrape this page recursively using BeautifulSoup.
The problem, however, is that the PDF links actually open a new page in which the PDFs are embedded, and the true PDF links can then be found in the embed tag of that page.
I therefore added a line to check whether the content is of type application/pdf. However, using the redirect URL, I am unable to extract the PDF links from this new page with the embedded PDF (a sketch of pulling the link out of the embed tag follows my attempt below).
I tried the following, but it did not work (a valid PDF link is never found):
# run the following in a .py file:
# spider = fdb.OurSpider()
# spider.scrape_page(url=url)

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests import get
import time

MAX_DEPTH = 10

class OurSpider:
    def __init__(self):
        """Init our Custom Spider"""

    def scrape_page(self, url):
        """Scrape page"""
        try:
            self.download_pdfs(url=url)
        except requests.exceptions.MissingSchema:
            print(f'skipped MissingSchema [{url}]')
        try:
            links = self.get_links(url=url)
            print(links)
        except Exception:
            print('')

    def download_pdfs(self, url, depth=1):
        # If there is no such folder, the script will create one automatically
        print('')
        print(f'--- [{depth}] {url}')
        if depth > MAX_DEPTH:
            return 'max depth reached'
        soup = self.get_soup(url=url)
        links = soup.select("a[href$='.pdf']")
        for link in links:
            try:
                full_url = urljoin(url, link['href'])
                content = get(full_url)
                if content.status_code == 200 and content.headers['content-type'] == 'application/pdf':
                    self.download_pdf(full_url=full_url)
                elif full_url != url:
                    self.download_pdfs(url=full_url, depth=depth + 1)
                else:
                    print('skipping url')
            except requests.exceptions.InvalidSchema:
                print(f'skipped InvalidSchema [{link}]')
        print('--- downloading pdfs done')

    def download_pdf(self, full_url):
        """Download single url"""
        filename = "".join(['tmp/', str(round(time.time() * 1000)), '.pdf'])
        if not self.file_exists(filename=filename):
            print(f'{filename}: {full_url}')
            with open(filename, 'wb') as f:
                f.write(requests.get(full_url).content)

    def get_links(self, url):
        """Get the links given the url"""
        soup = self.get_soup(url=url)
        return soup.findAll('a', href=True)

    @staticmethod
    def file_exists(filename):
        """File exists locally"""
        return os.path.exists(filename)

    @staticmethod
    def get_soup(url):
        """Init the url"""
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        return soup
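For the intermediate pages described in the question, one way to reach the real PDF is to fetch the redirect page and read the link out of its embed tag. A minimal sketch, assuming the PDF URL sits in the tag's src attribute (where exactly it sits varies by site):

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def resolve_embedded_pdf(page_url):
    """Return the URL of the PDF embedded in page_url, or None."""
    soup = BeautifulSoup(requests.get(page_url).text, 'html.parser')
    embed = soup.find('embed')  # the tag that carries the actual document
    if embed is not None and embed.get('src'):
        return urljoin(page_url, embed['src'])
    return None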
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from urllib.parse import unquote

site = "https://www.masked.com/us/individual/resources/regulatory-documents/mutual-funds"

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    target = [f"{url[:25]}{item.get('href')}"
              for item in soup.findAll("a", title="Annual Report")]
    return target

def parse(url):
    with requests.Session() as req:
        r = req.get(url)
        match = [unquote(f"{r.url[:25]}{match.group(1)}") for match in re.finditer(
            r"Override=(.+?)\"", r.text)]
        return match

with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(parse, url) for url in main(site)]

links = []
for future in futures:
    links.extend(future.result())

print(f"Collected {len(links)}")

def download(url):
    with requests.Session() as req:
        r = req.get(url)
        if r.status_code == 200 and r.headers['Content-Type'] == "application/pdf;charset=UTF-8":
            name = r.url.rfind("/") + 1
            name = r.url[name:]
            with open(f"{name}", 'wb') as f:
                f.write(r.content)
            return f"Saving {name}"

with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(download, url) for url in links]
    for future in as_completed(futures):
        print(future.result())
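Note that the exact Content-Type comparison above ("application/pdf;charset=UTF-8") is brittle; many servers send plain application/pdf, so a looser check such as r.headers.get('Content-Type', '').startswith('application/pdf') tends to be more robust.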
I'm trying to get some data from a JavaScript webpage. My code generates multiple links and parses them one by one. The parsing output is a list, but it lives inside the class. I want to insert the list items into an SQLite table, and for that I need to make the local list items accessible globally. I've tried creating a global list, putting it in the class, then appending to it and returning it. I've tried inserting the items into the database directly from the processCurrentPage method, and I've tried creating a list under the class and reaching it via WebPage.list. None of these worked. One of my attempts is below (not the best one, just an example; I've tried many alternatives like it). Can you suggest a good way to handle this, please?
P.S.: I am new to Python, but I've been researching this for two whole days and have read all the class documentation without finding a way.
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
import requests
from bs4 import BeautifulSoup
import bs4 as bs

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    alldatas = []

    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext

    @property
    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
            return True

    def processCurrentPage(self, html):
        url = self.url().toString()
        # do stuff with html...
        soup = bs.BeautifulSoup(html, 'html.parser')
        data = soup.find('div', class_='tablo_dual_board')
        data1 = data.text
        data2 = data1.splitlines()
        self.alldatas += data2
        if not self.fetchNext:
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
        # disable javascript error output
        pass

if __name__ == '__main__':
    # generate some test urls
    onexurl = "https://1xbahis1.com/en/live/Football/"
    r = requests.get(onexurl)
    soup = BeautifulSoup(r.content, "html.parser")
    income = soup.find_all("ul", {"id": "games_content"})
    links = soup.find_all("a", {"class": "c-events__name"})
    urls = []
    for matchlink in links:
        urls.append("https://1xbahis1.com/en/" + (matchlink.get("href")))
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    print(webpage.alldatas)
    sys.exit(app.exec_())
Below is a version of your script that should do what you want. The scrape_page function is called for each url that is processed, and the data is added to a global records list. The process_records function is called once after all the pages have been scraped. You can use this function to add the records to your database.
import sys
import requests
from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

records = []

def scrape_page(url, html):
    print('scrape page:', url)
    soup = BeautifulSoup(html, 'html.parser')
    data = soup.find('div', class_='tablo_dual_board')
    if data is not None:
        records.append(data.text.splitlines())
    else:
        print('error: could not find tablo_dual_board')

def process_records():
    # add record to database ...
    print('process records:', len(records))

def generate_urls():
    onexurl = "https://1xbahis1.com/en/live/Football/"
    reply = requests.get(onexurl)
    soup = BeautifulSoup(reply.content, "html.parser")
    income = soup.find_all("ul", {"id": "games_content"})
    links = soup.find_all("a", {"class": "c-events__name"})
    urls = []
    for matchlink in links:
        urls.append("https://1xbahis1.com/en/" + (matchlink.get("href")))
    return urls

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
            return True

    def processCurrentPage(self, html):
        scrape_page(self.url().toString(), html)
        if not self.fetchNext():
            process_records()
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
        # disable javascript error output
        pass

if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(generate_urls())
    sys.exit(app.exec_())
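If it helps, process_records could write the scraped lines straight into SQLite along these lines (the database file and the single-column schema here are assumptions; adapt them to whatever structure your table needs):

import sqlite3

def process_records():
    # records is a list of lists of text lines; store one line per row
    con = sqlite3.connect('matches.db')  # hypothetical database file
    con.execute('CREATE TABLE IF NOT EXISTS matches (line TEXT)')
    for record in records:
        con.executemany('INSERT INTO matches (line) VALUES (?)',
                        [(line,) for line in record])
    con.commit()
    con.close()
    print('process records:', len(records))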
Could someone please explain why I get a blank return when running this code? I am simply trying to print the contents of an HTML tag using Beautiful Soup. The code is below.
Thanks
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup

http = urllib3.PoolManager()

def stats():
    url = 'https://www.flashscore.com.au/football/usa/mls/results/'
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data, 'lxml')
    right_table = soup.find('div', {'class': 'fs-table tournament-page'})
    print(right_table.text)

stats()
You can fetch and process multiple URLs using PyQt5, as you asked in your comment, like this:
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWebKit import *
from PyQt5.QtWebKitWidgets import QWebPage
from PyQt5.QtWidgets import QApplication
import bs4 as bs
import sys

class Render(QWebPage):
    def __init__(self):
        super(Render, self).__init__()
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
            return True

    def processCurrentPage(self):
        print(self.mainFrame().url().toString())
        result = self.mainFrame().toHtml()
        soup = bs.BeautifulSoup(result, 'lxml')
        right_table = soup.find('div', {'class': 'fs-table tournament-page'})
        print(right_table.text)

    def handleLoadFinished(self):
        self.processCurrentPage()
        if not self.fetchNext():
            app.quit()

if __name__ == '__main__':
    urls = ["https://www.flashscore.com.au/football/usa/mls/results/",
            "https://www.flashscore.com.au/football/usa/mls/fixtures/"]
    app = QApplication(sys.argv)
    renderer = Render()
    renderer.start(urls)
    sys.exit(app.exec_())
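One caveat: QtWebKit was removed from Qt in 5.6, so on current PyQt5 builds the QtWebKit/QtWebKitWidgets imports above will fail. The same pattern ports to QtWebEngine roughly as follows (QWebEnginePage has no mainFrame(); load() and loadFinished live on the page itself, and toHtml() is asynchronous, delivering the HTML to a callback):

from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage  # replaces QWebPage

class Render(QWebEnginePage):
    def __init__(self):
        super().__init__()
        self.loadFinished.connect(self.handleLoadFinished)  # no mainFrame()

    def handleLoadFinished(self):
        # toHtml is asynchronous: it passes the HTML to the given callback
        self.toHtml(self.processCurrentPage)

    def processCurrentPage(self, html):
        print(len(html))  # parse html with BeautifulSoup here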