Unable to make a bridge between two classes - python

I've written some code in Python and my intention is to supply the links produced by the "web_parser" class to the "get_docs" class. However, I can't figure out a productive way to do so. All I want to do is build a bridge between the two classes so that the "web_parser" class produces links and the "get_docs" class processes them to get the refined output. Any idea as to how I can do it flawlessly will be highly appreciated. Thanks in advance.
from lxml import html
import requests

class web_parser:
    page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    main_url = "https://www.yellowpages.com"

    def __init__(self, link):
        self.link = link
        self.vault = []

    def parser(self):
        self.get_link(self.page_link)

    def get_link(self, url):
        page = requests.get(url)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            self.vault.append(self.main_url + item_link)

class get_docs(web_parser):
    def __init__(self, new_links):
        web_parser.__init__(self, link)
        self.new_links = [new_links]

    def procuring_links(self):
        for link in self.vault:
            self.using_links(link)

    def using_links(self, newly_created_link):
        page = requests.get(newly_created_link)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        print(name, phone)

if __name__ == '__main__':
    crawl = web_parser(web_parser.page_link)
    parse = get_docs(crawl)
    parse.parser()
    parse.procuring_links()
I know very little about creating classes, so please forgive my ignorance. Upon execution at this stage I get an error:

    web_parser.__init__(self, link)
NameError: name 'link' is not defined

I'm not quite sure how you want to use it: by giving a parameter to web_parser, or by using a hardcoded link inside the class?
From the commands you are using in __main__, you could proceed like below:
class get_docs(object):
    def __init__(self, web_parser):
        self.vault = web_parser.vault

if __name__ == '__main__':
    crawl = web_parser()        # create an instance
    crawl.parser()
    parse = get_docs(crawl)     # give the instance to get_docs, or directly the vault with crawl.vault
    parse.procuring_links()     # execute get_docs processing
You'll need to correct the web_parser class too:
you have to choose between a parameter given during creation (link) and the hardcoded page_link; just adapt the method parser() to target the right one.
class web_parser:
    def __init__(self, link=''):
        self.link = link
        self.vault = []
        self.page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
        self.main_url = "https://www.yellowpages.com"

To fix the NameError you posted in your question, you need to add another parameter to __init__ of your subclass - and pass something to it.
class get_docs(web_parser):
    #def __init__(self, new_links):
    def __init__(self, link, new_links):
        web_parser.__init__(self, link)
        self.new_links = [new_links]
That said, web_parser doesn't seem to do anything with that data, so maybe just remove it from the base class.
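Putting the two fixes together, a minimal end-to-end sketch (assuming the yellowpages markup still matches the selectors above; the vault is handed from one class to the other by composition instead of inheritance):

from lxml import html
import requests

class web_parser:
    page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    main_url = "https://www.yellowpages.com"

    def __init__(self):
        self.vault = []

    def parser(self):
        self.get_link(self.page_link)

    def get_link(self, url):
        page = requests.get(url)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            self.vault.append(self.main_url + item_link)

class get_docs(object):
    def __init__(self, vault):
        self.vault = vault  # the bridge: links produced by web_parser

    def procuring_links(self):
        for link in self.vault:
            self.using_links(link)

    def using_links(self, newly_created_link):
        page = requests.get(newly_created_link)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        print(name, phone)

if __name__ == '__main__':
    crawl = web_parser()
    crawl.parser()                 # produce the links
    parse = get_docs(crawl.vault)  # hand them over
    parse.procuring_links()        # consume them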

Related

How to search for a particular data in the url using python

Hello, I'm working on something and I have got stuck in a script whose first method takes a string argument and does a Google search on it; then it moves into the second method, which takes the resulting website.
For example:
from googlesearch import search

class online():
    def __init__(self):
        self.search = ""

    def get_url(self):
        for i in search(self.search, num_results=10):
            return i

    def dump_data(self):
        pass
So let's say that the search is "how to make money?"; it will give me the link to an article.
How can I dump the data at that URL which answers the question?
And the search() method from the googlesearch module only returns one URL; how can I fix that?
If all you want is to iterate over the top 10 search results, then you can simply return the generator that search() returns. Something like this:
from googlesearch import search

class Google(object):
    def __init__(self, terms):
        self.terms = terms

    def get_urls(self):
        return search(self.terms, num_results=10)

    def dump_data(self):
        pass

for url in Google("how to make money?").get_urls():
    print(url)
Note it's a bad idea to give a class attribute the same name (e.g. "search") as a function you imported.
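As for dumping the data behind each result, a minimal sketch (requests and BeautifulSoup are assumptions here, not part of googlesearch; the paragraph-text heuristic is just an illustration):

import requests
from bs4 import BeautifulSoup
from googlesearch import search

class Google(object):
    def __init__(self, terms):
        self.terms = terms

    def get_urls(self):
        return search(self.terms, num_results=10)

    def dump_data(self):
        # fetch each result page and yield its visible paragraph text
        for url in self.get_urls():
            resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
            soup = BeautifulSoup(resp.text, "html.parser")
            yield url, " ".join(p.get_text(strip=True) for p in soup.find_all("p"))

for url, text in Google("how to make money?").dump_data():
    print(url, text[:200])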

How to split code into different python files

I have been working on an I/O-bound application where I will run multiple scripts at the same time depending on the args I call a script with, e.g. monitor.py --s="sydsvenskan", monitor.py -ss="bbc", etc.
from __future__ import annotations
from abc import abstractmethod
from typing import ClassVar, Dict
from typing import Optional

import attr
import requests
from selectolax.parser import HTMLParser

@attr.dataclass
class Info:
    """Scraped info about news"""
    all_articles: set = attr.ib(factory=set)
    store: str = attr.ib(factory=str)
    name: Optional[str] = attr.ib(factory=str)
    image: Optional[str] = attr.ib(factory=str)

class Scraper:
    scrapers: ClassVar[Dict[str, Scraper]] = {}
    domain: ClassVar[str]

    def __init_subclass__(cls) -> None:
        Scraper.scrapers[cls.domain] = cls

    @classmethod
    def for_url(cls, domain, url) -> Scraper:
        return cls.scrapers[domain](url)

    @abstractmethod
    def scrape_feed(self):
        pass

    @abstractmethod
    def scrape_product(self):
        pass

class BBCScraper(Scraper):
    domain = 'BBC'

    def __init__(self, url):
        self.url = url

    def scrape_feed(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                all_articles = {
                    f"https://www.BBC.se{product_link.attrs['href']}" for product_link in
                    doc.css('td.search-productnamne > a, div.product-image > a')
                }
                return Info(
                    store="BBC",
                    all_articles=all_articles
                )

    def scrape_product(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                # FIXME Scrape valid webelements
                name = "Test"
                image = "Test"
                return Info(
                    store="BBC",
                    name=name,
                    image=image,
                )

class SydsvenskanScraper(Scraper):
    domain = 'Sydsvenskan'

    def __init__(self, url):
        self.url = url

    def scrape_feed(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                all_articles = {
                    f"https://Sydsvenskan.se/{product_link.attrs['href']}" for product_link in
                    doc.css('div.product-image > a, td.search-productnamne > a')
                }
                return Info(
                    store="Sydsvenskan",
                    all_articles=all_articles
                )

    def scrape_product(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                # FIXME Scrape valid webelements
                name = "Test"
                image = "Test"
                return Info(
                    store="Sydsvenskan",
                    name=name,
                    image=image,
                )

if __name__ == "__main__":
    # FIXME Use arguments instead
    domain = 'BBC'
    url = 'https://www.bbc.co.uk/'
    scraper = Scraper.for_url(domain, url)
    r = scraper.scrape_feed()
    print(r)
As you can currently see, I have "hardcoded":

domain = 'BBC'
url = 'https://www.bbc.co.uk/'

which will eventually be passed in through arguments instead.
However, if I start to add more stores/news sites to the Scraper class, e.g. 40 different sites, it would be pretty hard to navigate to the correct code when you want to maintain it or make any changes.
I wonder how I can in that case split the code into different files, so that e.g. Sydsvenskan lives in a file by itself and BBC by itself. I could then more easily maintain the code in the future if there are any changes.
OK, I understand what you're looking for. And sorry to say, you're out of luck, at least as far as my knowledge of Python goes. You can do it two ways:
1. Use importlib to search through a folder/package that contains those files and import them into a list or dict to be retrieved. You said you wanted to avoid this, but either way you would have to use importlib, and #2 is the reason why.
2. Use a base class which, when inherited, adds the derived class to a list or dict that stores it, so that you can retrieve it via the class. The issue here is that if you move a derived class into a new file, that registration code won't run until you import the file. So you would still need to import it explicitly, or implicitly via importlib (dynamic import).
So you'll have to use importlib (dynamic import) either way.
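A minimal sketch of that dynamic import, assuming a scrapers/ package with one module per site (bbc.py, sydsvenskan.py, ...) and the Scraper base class in scrapers/base.py; importing each module is what triggers the __init_subclass__ registration:

import importlib
import pkgutil

import scrapers                    # the package holding one module per site (assumed layout)
from scrapers.base import Scraper  # the base class from the code above (assumed module name)

def load_scrapers():
    # import every module in the package so each subclass
    # registers itself in Scraper.scrapers via __init_subclass__
    for module_info in pkgutil.iter_modules(scrapers.__path__):
        importlib.import_module(f"scrapers.{module_info.name}")

if __name__ == "__main__":
    load_scrapers()
    scraper = Scraper.for_url('BBC', 'https://www.bbc.co.uk/')
    print(scraper.scrape_feed())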

How to pick up the correct class (NameError)

I have been working on a project where I want to gather URLs. The idea is that I can just import all the modules containing scraper classes, and each of them registers itself into the list.
I have currently done:
import sys
import tldextract

class Scraper:
    scrapers = {}

    def __init_subclass__(scraper_class):
        Scraper.scrapers[scraper_class.url] = scraper_class  # .url -> Unresolved attribute reference 'url' for class 'Scraper'

    @classmethod
    def for_url(cls, url):
        k = tldextract.extract(url)
        return scrapers[k.domain]()  # <-- Unresolved reference 'scrapers'

class BBCScraper(Scraper):
    url = 'bbc.co.uk'

    def scrape(s):
        print(s)
        # FIXME Scrape the correct values for BBC
        return "Yay works!"

url = 'https://www.bbc.co.uk/'
scraper = Scraper.for_url(url)
scraper.scrape("yay")
My current problem is that I am not able to continue executing the code, because return scrapers[k.domain]() fails:

Output >>> NameError: name 'scrapers' is not defined

I wonder how I can pick up the correct class, so that, for example, if the URL is the BBC's, it goes into the BBCScraper class; then we call scrape, which later on will return the values scraped from that specific website.
Do as you did in __init_subclass__ (qualify the name through the class) or use cls.scrapers:
@classmethod
def for_url(cls, url):
    k = tldextract.extract(url)
    return Scraper.scrapers[k.domain]()
    # or
    return cls.scrapers[k.domain]()
As for the second issue: please ask that in a separate question, and please explain better what exactly you are trying to do.
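One more thing worth checking once the NameError is gone: the registry is keyed by url = 'bbc.co.uk', but the lookup uses k.domain, and tldextract.extract('https://www.bbc.co.uk/').domain is 'bbc', so the key and the lookup never match. A minimal sketch with the two aligned (keying on the extracted domain is my assumption of the intent):

import tldextract

class Scraper:
    scrapers = {}

    def __init_subclass__(cls):
        # register each subclass under the same key the lookup will use
        Scraper.scrapers[cls.domain] = cls

    @classmethod
    def for_url(cls, url):
        k = tldextract.extract(url)
        return cls.scrapers[k.domain]()

class BBCScraper(Scraper):
    domain = 'bbc'  # tldextract reports 'bbc' for https://www.bbc.co.uk/

    def scrape(self):
        # FIXME Scrape the correct values for BBC
        return "Yay works!"

scraper = Scraper.for_url('https://www.bbc.co.uk/')
print(scraper.scrape())  # Yay works!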

Unable to use session within a classmethod of a web-scraper

I've created a Python script using a classmethod to fetch the profile name after logging in by submitting the credentials to a webpage. The script is able to fetch the profile name correctly. What I wish to do now is use the session within the classmethod. The session has already been defined within the __init__() method, and I would like to keep the existing design intact.
This is what I've tried so far:
import requests
from bs4 import BeautifulSoup

class StackOverflow:
    SEARCH_URL = "https://stackoverflow.com/users/login?ssrc=head&returnurl=https%3a%2f%2fstackoverflow.com%2f"

    def __init__(self, session):
        self.session = session

    @classmethod
    def crawl(cls, email, password):
        page = requests.get(cls.SEARCH_URL, headers={"User-Agent": "Mozilla/5.0"})
        sauce = BeautifulSoup(page.text, "lxml")
        fkey = sauce.select_one("[name='fkey']")["value"]
        payload = {"fkey": fkey, "email": email, "password": password}
        res = requests.post(cls.SEARCH_URL, data=payload, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "lxml")
        user = soup.select_one("div[class^='gravatar-wrapper-']").get("title")
        yield user

if __name__ == '__main__':
    with requests.Session() as s:
        result = StackOverflow(s)
        for item in result.crawl("email", "password"):
            print(item)
How can I use the session created in __init__ from within the classmethod?
You can't access self.session from a class method. __init__ is called when an instance of the class is created, but class methods are not bound to any particular instance, only to the class itself; that's why their first parameter is conventionally cls and not self.
You decided to create the session in __init__, so it can be assumed that

so1 = StackOverflow(session1)
so2 = StackOverflow(session2)

keep their sessions separate. If that is indeed your intention, the crawl method should not be decorated with @classmethod. If you define crawl(self, email, password), you will still be able to use StackOverflow.SEARCH_URL or self.__class__.SEARCH_URL to get the value defined on the StackOverflow class, or self.SEARCH_URL, which by default resolves to the same value but could be overridden per instance with so1.SEARCH_URL = "sth else" (while so2.SEARCH_URL would keep its original value).

Creating a Python classmethod

I've familiarized myself with the concept, most notably by watching Raymond Hettinger's excellent video and reading the accepted answer here, and I am wondering what I got wrong.
from urllib2 import urlopen  # missing from the snippet; Python 2, per the traceback below

class ReadHTML(object):
    def __init__(self, url):
        page = urlopen(url).read()
        self.page = page

    @classmethod
    def from_file(cls, path):
        page = open(path).read()
        return cls(page)
This works:

r = ReadHTML('http://example.com')
print r.page

and this does not:

r = ReadHTML.from_file('example.html')
print r.page
it throws me an error, as if I was trying to "urlopen" a file:
File "/usr/lib/python2.7/urllib2.py", line 258, in get_type
raise ValueError, "unknown url type: %s" % self.__original
ValueError: unknown url type: <!doctype html>
Can you see what's wrong?
You are still calling the class initializer, ReadHTML.__init__(), when you call cls(page); that call is no different from calling ReadHTML(page), you are just using a different reference. This method only accepts a url parameter and the code passes that to urlopen() regardless.
Adjust your ReadHTML.__init__() method to handle being passed a page instead of a URL:
class ReadHTML(object):
    def __init__(self, url=None, page=None):
        if url is not None:
            page = urlopen(url).read()
        self.page = page

    @classmethod
    def from_file(cls, path):
        page = open(path).read()
        return cls(page=page)
Now the code supports both paths to produce an instance.
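A quick usage sketch of the two construction paths (the URL and filename are the question's placeholders):

r = ReadHTML(url='http://example.com')  # fetches the page over HTTP
r = ReadHTML.from_file('example.html')  # reads the page from disk
print r.page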
from_file opens the page, but so does your __init__() constructor, so if you do ReadHTML.from_file('example.html'), you are essentially doing:
page = urlopen(open('example.html').read()).read()
Personally, I prefer Martijn's solution, for semantic clarity, but here is an alternative:
class ReadHTML(object):
    def __init__(self, url, opener=urlopen):
        self.page = opener(url).read()

    @classmethod
    def from_file(cls, path):
        return cls(path, opener=open)
This solution is advantageous because it gives you the capability of defining arbitrary openers (say, for opening files stored in a database).
I'm not a big fan of optional parameters overriding each other. I would make it so that the default constructor accepts a string, and I would have two separate alternate constructors to handle a filename and a URL.
I also modified the filename constructor to explicitly close the file.
class ReadHTML(object):
    def __init__(self, page):
        self.page = page

    @classmethod
    def from_filename(cls, path):
        with open(path) as f:
            page = f.read()
        return cls(page)

    @classmethod
    def from_url(cls, url):
        page = urlopen(url).read()
        return cls(page)
As a side note, I believe urllib/urllib2 support file://, so you would strictly not need the filename constructor (but I still believe it is nice to have).
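For reference, going through urlopen() with a file:// URL would look something like this (the absolute path is a placeholder):

page = urlopen('file:///home/user/example.html').read()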
