How can I get output from a class method into a global variable - Python

I'm trying to get some data from a JavaScript webpage. My code generates multiple links and parses them one by one; each parse produces a list. I wrote this code with help from here, but it produces the lists inside a class. I want to insert the list items into an sqlite table, and for that I need to make the local list items global. I've tried creating a global list, putting it into the class, appending to it and returning it; I've tried inserting into the database directly from the processCurrentPage method; and I've tried creating a list under the class and reaching it via Webpage.list. None of these worked. One of my attempts is below - not the best one, just an example; I've tried many alternatives like it. Can you suggest a good way to handle this, please?
P.S.: I am new to Python, but I've been researching this for two whole days and read all the class documentation, and still couldn't find a way.
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
import requests
from bs4 import BeautifulSoup
import bs4 as bs

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    alldatas = []

    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext

    @property
    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
            return True

    def processCurrentPage(self, html):
        url = self.url().toString()
        # do stuff with html...
        soup = bs.BeautifulSoup(html, 'html.parser')
        data = soup.find('div', class_='tablo_dual_board')
        data1 = data.text
        data2 = data1.splitlines()
        self.alldatas += data2
        if not self.fetchNext:
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
        # disable javascript error output
        pass

if __name__ == '__main__':
    # generate some test urls
    onexurl = "https://1xbahis1.com/en/live/Football/"
    r = requests.get(onexurl)
    soup = BeautifulSoup(r.content, "html.parser")
    income = soup.find_all("ul", {"id": "games_content"})
    links = soup.find_all("a", {"class": "c-events__name"})
    urls = []
    for matchlink in links:
        urls.append("https://1xbahis1.com/en/" + (matchlink.get("href")))

    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    print(webpage.alldatas)
    sys.exit(app.exec_())

Below is a version of your script that should do what you want. The scrape_page function is called for each url that is processed, and the data is added to a global records list. The process_records function is called once after all the pages have been scraped. You can use this function to add the records to your database.
import sys
import requests
from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

records = []

def scrape_page(url, html):
    print('scrape page:', url)
    soup = BeautifulSoup(html, 'html.parser')
    data = soup.find('div', class_='tablo_dual_board')
    if data is not None:
        records.append(data.text.splitlines())
    else:
        print('error: could not find tablo_dual_board')

def process_records():
    # add record to database ...
    print('process records:', len(records))

def generate_urls():
    onexurl = "https://1xbahis1.com/en/live/Football/"
    reply = requests.get(onexurl)
    soup = BeautifulSoup(reply.content, "html.parser")
    income = soup.find_all("ul", {"id": "games_content"})
    links = soup.find_all("a", {"class": "c-events__name"})
    urls = []
    for matchlink in links:
        urls.append("https://1xbahis1.com/en/" + (matchlink.get("href")))
    return urls

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    def __init__(self):
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
            return True

    def processCurrentPage(self, html):
        scrape_page(self.url().toString(), html)
        if not self.fetchNext():
            process_records()
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
        # disable javascript error output
        pass

if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(generate_urls())
    sys.exit(app.exec_())
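As a sketch of the database step, process_records could write each scraped record into SQLite with the standard sqlite3 module. The file name, table name, and single-column schema below are assumptions, not part of the original answer:

import sqlite3

def process_records():
    # minimal sketch: one row per scraped line, in a one-column table
    # ('scraped.db' and 'matches' are hypothetical names - adjust to your data)
    con = sqlite3.connect('scraped.db')
    con.execute('CREATE TABLE IF NOT EXISTS matches (line TEXT)')
    with con:  # the connection context manager commits the transaction on success
        for record in records:
            con.executemany('INSERT INTO matches (line) VALUES (?)',
                            [(line,) for line in record])
    con.close()
    print('process records:', len(records))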

Related

Retrieving Dynamic Webpage Content PyQt5

After being able to log in to a protected website, I'd like to scrape some of the content of the same webpage, which is loaded dynamically. This code block handles the authentication properly, but if I try to access a pre tag element with class name lang-py, I get None returned as the output.
import sys
from PyQt5.QtCore import QByteArray, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineCore import QWebEngineHttpRequest
from PyQt5.QtWebEngineWidgets import QWebEnginePage

class Render(QWebEnginePage):
    def __init__(self, url):
        app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self._html = ""
        username = "username"
        password = "password"
        base64string = QByteArray(("%s:%s" % (username, password)).encode()).toBase64()
        request = QWebEngineHttpRequest(QUrl.fromUserInput(url))
        request.setHeader(b"Authorization", b"Basic: %s" % (base64string,))
        self.load(request)
        app.exec_()

    @property
    def html(self):
        return self._html

    def _loadFinished(self):
        self.toHtml(self.handle_to_html)

    def handle_to_html(self, html):
        self._html = html
        QApplication.quit()

def main():
    url = "https://stackoverflow.com/questions/64055445/scraping-websites-with-protected-content-using-pyqt5/64055601?noredirect=1#comment113272437_64055601"
    r = Render(url)
    print(r.html)

if __name__ == "__main__":
    main()
How can I load the content in <pre>?
The element with tag "pre" and class "lang-py" is present in the HTML, so you can use BeautifulSoup to get the data:
# ...
from bs4 import BeautifulSoup
# ...

def main():
    url = "https://stackoverflow.com/questions/64055445/scraping-websites-with-protected-content-using-pyqt5/64055601?noredirect=1#comment113272437_64055601"
    r = Render(url)
    soup = BeautifulSoup(r.html, "html.parser")
    for tag in soup.find_all("pre", {"class": "lang-py"}):
        print("=" * 50)
        print(tag.text)
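If only the first matching block is needed, soup.find("pre", {"class": "lang-py"}) returns just that single tag (or None when nothing matches).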

How to scrape PDFs that are embedded with BeautifulSoup

I am trying to scrape this page recursively using BeautifulSoup.
The problem, however, is that the PDF links actually open a new page in which the PDFs are embedded; the true PDF links can then be found in the embed tag on that page.
I therefore added a line to check whether the content is of type application/pdf. However, using the redirect URL, I am unable to extract the PDF links from this new page with the embedded PDF.
I tried the following, but it did not work (a valid PDF link is never found):
# run the following in a .py file:
# spider = fdb.OurSpider()
# spider.scrape_page(url=url)

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests import get
import time

MAX_DEPTH = 10

class OurSpider:
    def __init__(self):
        """Init our Custom Spider"""

    def scrape_page(self, url):
        """Scrape page"""
        try:
            self.download_pdfs(url=url)
        except requests.exceptions.MissingSchema:
            print(f'skipped MissingSchema [{url}]')
        try:
            links = self.get_links(url=url)
            print(links)
        except:
            print('')

    def download_pdfs(self, url, depth=1):
        # If there is no such folder, the script will create one automatically
        print('')
        print(f'--- [{depth}] {url}')
        if depth > MAX_DEPTH:
            return 'max depth reached'
        soup = self.get_soup(url=url)
        links = soup.select("a[href$='.pdf']")
        for link in links:
            try:
                full_url = urljoin(url, link['href'])
                content = get(full_url)
                if content.status_code == 200 and content.headers['content-type'] == 'application/pdf':
                    self.download_pdf(full_url=full_url)
                elif full_url != url:
                    self.download_pdfs(url=full_url, depth=depth + 1)
                else:
                    print('skipping url')
            except requests.exceptions.InvalidSchema:
                print(f'skipped InvalidSchema [{link}]')
        print('--- downloading pdfs done')

    def download_pdf(self, full_url):
        """Download single url"""
        filename = "".join(['tmp/', str(round(time.time() * 1000)), '.pdf'])
        if not self.file_exists(filename=filename):
            print(f'{filename}: {full_url}')
            with open(filename, 'wb') as f:
                f.write(requests.get(full_url).content)

    def get_links(self, url):
        """Get the links given the url"""
        soup = self.get_soup(url=url)
        return soup.findAll('a', href=True)

    @staticmethod
    def file_exists(filename):
        """File exists locally"""
        return os.path.exists(filename)

    @staticmethod
    def get_soup(url):
        """Init the url"""
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        return soup
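The answer below takes a different route that avoids rendering the embedded viewer at all: it collects the "Annual Report" links, extracts each real PDF address from the Override= query parameter with a regular expression, and downloads the files concurrently with a thread pool (the site is masked here, as in the original):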
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from urllib.parse import unquote

site = "https://www.masked.com/us/individual/resources/regulatory-documents/mutual-funds"

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    target = [f"{url[:25]}{item.get('href')}"
              for item in soup.findAll("a", title="Annual Report")]
    return target

def parse(url):
    with requests.Session() as req:
        r = req.get(url)
        match = [unquote(f"{r.url[:25]}{match.group(1)}") for match in re.finditer(
            r"Override=(.+?)\"", r.text)]
        return match

with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(parse, url) for url in main(site)]

links = []
for future in futures:
    links.extend(future.result())

print(f"Collected {len(links)}")

def download(url):
    with requests.Session() as req:
        r = req.get(url)
        if r.status_code == 200 and r.headers['Content-Type'] == "application/pdf;charset=UTF-8":
            name = r.url.rfind("/") + 1
            name = r.url[name:]
            with open(f"{name}", 'wb') as f:
                f.write(r.content)
            return f"Saving {name}"
        else:
            pass

with ThreadPoolExecutor(max_workers=50) as executor:
    futures = [executor.submit(download, url) for url in links]
    for future in as_completed(futures):
        print(future.result())

Web Scraping - blank return

Could someone please explain to me why I get a blank return when running this code? I am simply trying to print the contents of an html tag using beautiful soup. Code is below.
Thanks
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup

http = urllib3.PoolManager()

def stats():
    url = 'https://www.flashscore.com.au/football/usa/mls/results/'
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data, 'lxml')
    right_table = soup.find('div', {'class': 'fs-table tournament-page'})
    print(right_table.text)

stats()
The table is rendered by JavaScript, so it isn't in the HTML that urllib3 fetches. You can fetch and process multiple URLs using PyQt5, as you asked in your comment, like this:
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWebKit import *
from PyQt5.QtWebKitWidgets import QWebPage
from PyQt5.QtWidgets import QApplication
import bs4 as bs
import sys

class Render(QWebPage):
    def __init__(self):
        super(Render, self).__init__()
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
            return True

    def processCurrentPage(self):
        print(self.mainFrame().url().toString())
        result = self.mainFrame().toHtml()
        soup = bs.BeautifulSoup(result, 'lxml')
        right_table = soup.find('div', {'class': 'fs-table tournament-page'})
        print(right_table.text)

    def handleLoadFinished(self):
        self.processCurrentPage()
        if not self.fetchNext():
            app.quit()

if __name__ == '__main__':
    urls = ["https://www.flashscore.com.au/football/usa/mls/results/",
            "https://www.flashscore.com.au/football/usa/mls/fixtures/"]
    app = QApplication(sys.argv)
    renderer = Render()
    renderer.start(urls)
    sys.exit(app.exec_())
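Note that QtWebKit was removed from Qt as of 5.6, and recent PyQt5 releases no longer ship the QtWebKit and QtWebKitWidgets modules, so on a current install this approach has to be ported to QtWebEngineWidgets, along the lines of the WebPage class in the first answer above.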

Class crawler written in python throws attribute error

After writing some code in Python, I've gotten stuck in deep trouble. I'm a newbie at writing code following OOP design in Python. The xpaths I've used in my code are flawless. I get lost when it comes to running the passing_links method of my Info_grabber class through an instance of the page_crawler class. Every time I run my code I get the error "'page_crawler' object has no attribute 'passing_links'". Perhaps the way I've written my class crawler is not how it should be. However, as I've spent a few hours on it, I hope I might get a suggestion as to which lines I should rectify to make it work. Thanks in advance for taking a look:
from lxml import html
import requests

class page_crawler(object):
    main_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    base_link = "https://www.yellowpages.com"

    def __init__(self):
        self.links = [self.main_link]

    def crawler(self):
        for link in self.links:
            self.get_link(link)

    def get_link(self, link):
        print("Running page " + link)
        page = requests.get(link)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            return self.base_link + item_link
        links = tree.xpath('//div[@class="pagination"]//li/a/@href')
        for url in links:
            if not self.base_link + url in self.links:
                self.links += [self.base_link + url]

class Info_grabber(page_crawler):
    def __init__(self, plinks):
        page_crawler.__init__(self)
        self.plinks = [plinks]

    def passing_links(self):
        for nlink in self.plinks:
            print(nlink)
            self.crawling_deep(nlink)

    def crawling_deep(self, uurl):
        page = requests.get(uurl)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        try:
            email = tree.xpath('//div[@class="business-card-footer"]/a[@class="email-business"]/@href')[0]
        except IndexError:
            email = ""
        print(name, phone, email)

if __name__ == '__main__':
    crawl = Info_grabber(page_crawler)
    crawl.crawler()
    crawl.passing_links()
Now upon execution I get a new error, "raise MissingSchema(error)", when it hits the line self.crawling_deep(nlink).
I'm not sure I understand what you're trying to do in page_crawler.get_link, but I think you should have a separate method for collecting the "pagination" links.
I renamed Info_grabber.plinks to Info_grabber.links so that page_crawler.crawler can access them, and managed to extract info from several pages; however, the code is far from ideal.
class page_crawler(object):
    main_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    base_link = "https://www.yellowpages.com"

    def __init__(self):
        self.links = []
        self.pages = []

    def crawler(self):
        for link in self.links:
            self.get_link(link)

    def get_link(self, link):
        print("Running page " + link)
        page = requests.get(link)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            if not self.base_link + item_link in self.links:
                self.links += [self.base_link + item_link]

    def get_pages(self, link):
        page = requests.get(link)
        tree = html.fromstring(page.text)
        links = tree.xpath('//div[@class="pagination"]//li/a/@href')
        for url in links:
            if not self.base_link + url in self.pages:
                self.pages += [self.base_link + url]

class Info_grabber(page_crawler):
    def __init__(self, plinks):
        page_crawler.__init__(self)
        self.links += [plinks]

    def passing_links(self):
        for nlink in self.links:
            print(nlink)
            self.crawling_deep(nlink)

    def crawling_deep(self, uurl):
        page = requests.get(uurl)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        try:
            email = tree.xpath('//div[@class="business-card-footer"]/a[@class="email-business"]/@href')[0]
        except IndexError:
            email = ""
        print(name, phone, email)

if __name__ == '__main__':
    url = page_crawler.main_link
    crawl = Info_grabber(url)
    crawl.crawler()
    crawl.passing_links()
You'll notice that I added a pages property and a get_pages method to page_crawler; I'll leave the implementation part to you, but a possible wiring is sketched below.
You might need to add more methods to page_crawler later on, as they could be of use if you develop more child classes. Finally, consider looking into composition, as it is also a strong OOP feature.
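As one possible way to wire the new get_pages method in (just a sketch, since the implementation is left open; note that self.links still contains the seed search URL, so crawling_deep will visit it too), the __main__ block could be adapted along these lines:

if __name__ == '__main__':
    crawl = Info_grabber(page_crawler.main_link)
    crawl.get_pages(page_crawler.main_link)  # collect the pagination urls first
    for page in crawl.pages:
        crawl.get_link(page)                 # gather business links from every results page
    crawl.passing_links()                    # scrape name/phone/email for each collected link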
Your crawl is an instance of the page_crawler class, but not of the Info_grabber class, which is the class that has the passing_links method. I think what you want to do is make crawl an instance of Info_grabber instead.
Then I believe before doing self.crawling_deep you must do:

if n_link:
    page = requests.get(n_link).text
    tel = re.findall(r'\d{10}', page)[0] if re.findall(r'\d{10}', page) else ""
    print(tel)

BeautifulSoup findAll HTML class with multiple variable class inputs

I have the following code which scrapes a website for divs with the class "odd" or "even". I'd like to make "odd" and "even" an argument my function takes in, which would allow me to add other divs as well. Here is my code:
#
# Imports
#
import urllib2
from bs4 import BeautifulSoup
import re
import os
from pprint import pprint

#
# library
#
def get_soup(url):
    page = urllib2.urlopen(url)
    contents = page.read()
    soup = BeautifulSoup(contents, "html.parser")
    body = soup.findAll("tr", ["even", "odd"])
    string_list = str([i for i in body])
    return string_list

def save_to_file(path, soup):
    with open(path, 'w') as fhandle:
        fhandle.write(soup)

#
# script
#
def main():
    url = r'URL GOES HERE'
    path = os.path.join('PATH GOES HERE')
    the_soup = get_soup(url)
    save_to_file(path, the_soup)

if __name__ == '__main__':
    main()
I'd like to incorporate *args into the code so the get_soup function would look like this:
def get_soup(url, *args):
    page = urllib2.urlopen(url)
    contents = page.read()
    soup = BeautifulSoup(contents, "html.parser")
    body = soup.findAll("tr", [args])
    string_list = str([i for i in body])
    return string_list

def main():
    url = r'URL GOES HERE'
    path = os.path.join('PATH GOES HERE')
    the_soup = get_soup(url, "odd", "even")
    save_to_file(path, the_soup)
Unfortunately, this isn't working. Ideas?
Don't put args in a list; args is already a tuple, so just pass it directly:
body = soup.findAll("tr", args)
If you use [args], you end up with something like [("odd", "even")].
Also, str([i for i in body]) makes no real sense; it is the same as just doing str(body), and I don't see how that format could be useful.
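Putting the two fixes together, a corrected get_soup might look like this (a sketch based on the suggestions above; str(body) replaces the list comprehension only because save_to_file expects a string):

def get_soup(url, *args):
    page = urllib2.urlopen(url)
    contents = page.read()
    soup = BeautifulSoup(contents, "html.parser")
    # args is already a tuple such as ("odd", "even"), so pass it straight through
    body = soup.findAll("tr", args)
    return str(body)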
