How to pick up the correct class (NameError) - python

I have been working on a project where I want to gather the URLs, and then I could just import all the modules with the scraper classes and it should register all of them into the list.
I have currently done:
import sys
import tldextract

class Scraper:
    scrapers = {}

    def __init_subclass__(scraper_class):
        Scraper.scrapers[scraper_class.url] = scraper_class  # .url -> Unresolved attribute reference 'url' for class 'Scraper'

    @classmethod
    def for_url(cls, url):
        k = tldextract.extract(url)
        return scrapers[k.domain]()  # <-- Unresolved reference 'scrapers'

class BBCScraper(Scraper):
    url = 'bbc.co.uk'

    def scrape(s):
        print(s)
        # FIXME Scrape the correct values for BBC
        return "Yay works!"

url = 'https://www.bbc.co.uk/'
scraper = Scraper.for_url(url)
scraper.scrape("yay")
My current problem is that I am not able to continue executing the code, as the line return scrapers[k.domain]() fails:
Output >>> NameError: name 'scrapers' is not defined
I wonder how I can pick up the correct class; for example, if the URL is the BBC's, it should go into the BBCScraper class, and then we call scrape, which will later return the values that have been scraped from that specific website.

Do as you did in __init_subclass__ or use cls.scrapers.
@classmethod
def for_url(cls, url):
    k = tldextract.extract(url)
    return Scraper.scrapers[k.domain]()
    # or
    return cls.scrapers[k.domain]()
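For reference, here is a minimal self-contained sketch of the registry pattern with that fix applied. Note one assumption beyond the fix itself: tldextract.extract('https://www.bbc.co.uk/').domain is 'bbc' (not 'bbc.co.uk'), so the key each subclass registers has to match what for_url looks up:

import tldextract

class Scraper:
    scrapers = {}

    def __init_subclass__(scraper_class):
        # Register every subclass under its declared key
        Scraper.scrapers[scraper_class.url] = scraper_class

    @classmethod
    def for_url(cls, url):
        k = tldextract.extract(url)
        return cls.scrapers[k.domain]()

class BBCScraper(Scraper):
    url = 'bbc'  # must equal tldextract's .domain for the lookup to succeed

    def scrape(self, s):
        # FIXME Scrape the correct values for BBC
        return "Yay works!"

scraper = Scraper.for_url('https://www.bbc.co.uk/')  # -> a BBCScraper instance
print(scraper.scrape("yay"))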
As for the second issue: please ask that in a separate question, and explain more precisely what you are trying to do.

Related

How to search for a particular data in the url using python

Hello, I'm working on something and I have got stuck in a script whose first method takes a string argument and does a Google search on it; it then moves into the second method, which takes the result from the website.
For example:
from googlesearch import search

class online():
    def __init__(self):
        self.search = ""

    def get_url(self):
        for i in search(self.search, num_results=10):
            return i

    def dump_data(self):
        pass
So let's say that search = "how to make money?". It will give me the link to an article.
So how can I dump the data that answers this question from the URL?
And the search() method from the googlesearch module only returns 1 URL; how can I fix that?
If all you want is to iterate over the top 10 search results, then you can simply return the generator that search() returns. Something like this:
from googlesearch import search

class Google(object):
    def __init__(self, terms):
        self.terms = terms

    def get_urls(self):
        return search(self.terms, num_results=10)

    def dump_data(self):
        pass

for url in Google("how to make money?").get_urls():
    print(url)
Note that it's a bad idea to give a class attribute the same name (e.g. "search") as a function you have imported.
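One more caveat: search() returns a generator, and a generator is exhausted after a single pass. If you need to reuse or index the results, materialize them first; a small sketch, using the Google class from above:

urls = list(Google("how to make money?").get_urls())  # drain the generator once
print(len(urls))  # now len(), indexing, and repeated iteration all work
print(urls[0])    # first result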

How to split code into different python files

I have been working on an I/O-bound application where I run multiple scripts at the same time, depending on the args I call a script with, e.g. monitor.py --s="sydsvenskan", monitor.py -ss="bbc", etc.
from __future__ import annotations
from abc import abstractmethod
from typing import ClassVar, Dict
from typing import Optional
import attr
import requests
from selectolax.parser import HTMLParser

@attr.dataclass
class Info:
    """Scraped info about news"""
    all_articles: set = attr.ib(factory=set)
    store: str = attr.ib(factory=str)
    name: Optional[str] = attr.ib(factory=str)
    image: Optional[str] = attr.ib(factory=str)

class Scraper:
    scrapers: ClassVar[Dict[str, Scraper]] = {}
    domain: ClassVar[str]

    def __init_subclass__(cls) -> None:
        Scraper.scrapers[cls.domain] = cls

    @classmethod
    def for_url(cls, domain, url) -> Scraper:
        return cls.scrapers[domain](url)

    @abstractmethod
    def scrape_feed(self):
        pass

    @abstractmethod
    def scrape_product(self):
        pass

class BBCScraper(Scraper):
    domain = 'BBC'

    def __init__(self, url):
        self.url = url

    def scrape_feed(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this at least :P
            if rep:
                doc = HTMLParser(rep.text)
                all_articles = {
                    f"https://www.BBC.se{product_link.attrs['href']}" for product_link in
                    doc.css('td.search-productnamne > a, div.product-image > a')
                }
                return Info(
                    store="BBC",
                    all_articles=all_articles
                )

    def scrape_product(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this at least :P
            if rep:
                doc = HTMLParser(rep.text)
                # FIXME Scrape valid webelements
                name = "Test"
                image = "Test"
                return Info(
                    store="BBC",
                    name=name,
                    image=image,
                )

class SydsvenskanScraper(Scraper):
    domain = 'Sydsvenskan'

    def __init__(self, url):
        self.url = url

    def scrape_feed(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this at least :P
            if rep:
                doc = HTMLParser(rep.text)
                all_articles = {
                    f"https://Sydsvenskan.se/{product_link.attrs['href']}" for product_link in
                    doc.css('div.product-image > a, td.search-productnamne > a')
                }
                return Info(
                    store="Sydsvenskan",
                    all_articles=all_articles
                )

    def scrape_product(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this at least :P
            if rep:
                doc = HTMLParser(rep.text)
                # FIXME Scrape valid webelements
                name = "Test"
                image = "Test"
                return Info(
                    store="Sydsvenskan",
                    name=name,
                    image=image,
                )

if __name__ == "__main__":
    # FIXME Use arguments instead
    domain = 'BBC'
    url = 'https://www.bbc.co.uk/'
    scraper = Scraper.for_url(domain, url)
    r = scraper.scrape_feed()
    print(r)
As you can see, I have currently "hardcoded":
domain = 'BBC'
url = 'https://www.bbc.co.uk/'
which will instead be passed through arguments.
However, if I start to add more "stores/news sites" to the Scraper class, e.g. 40 different sites, it would be pretty hard to navigate to the correct code when you want to maintain it or make any changes.
I wonder how I can, in that case, split the code into different files where e.g. Sydsvenskan is by itself and BBC is by itself. I could then maintain the code more easily in the future if there are any changes.
OK, I understand what you're looking for, and sorry to say you're out of luck, at least as far as my knowledge of Python goes. You can do it in two ways.
1. Use importlib to search through a folder/package that contains those files and import them into a list or dict to be retrieved. You said you wanted to avoid this, but either way you would have to use importlib, and #2 is the reason why.
2. Use a base class that, when inherited, adds the derived class (via a hook like the __init_subclass__ you already have) to a list or dict that stores it, so you can retrieve it via the class object. The issue here is that if you move a derived class into a new file, that registration code won't run until you import the file. So you would still need to explicitly import the file, or implicitly import it via importlib (dynamic import).
So you'll have to use importlib (dynamic import) either way; a minimal sketch of that follows.
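A minimal sketch of the dynamic-import approach, assuming (hypothetically) that the per-site scrapers live in a scrapers/ package, one module per site (scrapers/bbc.py, scrapers/sydsvenskan.py, ...), each defining a Scraper subclass:

import importlib
import pkgutil

import scrapers  # hypothetical package holding one module per site

def load_all_scrapers():
    # Importing each module triggers Scraper.__init_subclass__,
    # which fills Scraper.scrapers with every subclass it finds.
    for module_info in pkgutil.iter_modules(scrapers.__path__):
        importlib.import_module(f"scrapers.{module_info.name}")

load_all_scrapers()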

Unable to use session within a classmethod of a web-scraper

I've created a Python script using a classmethod to fetch the profile name after logging in by submitting the credentials on a webpage. The script is able to fetch the profile name correctly. What I wish to do now is use the session within the classmethod. The session has already been defined within the __init__() method. I would like to keep the existing design intact.
This is what I've tried so far:
import requests
from bs4 import BeautifulSoup

class StackOverflow:
    SEARCH_URL = "https://stackoverflow.com/users/login?ssrc=head&returnurl=https%3a%2f%2fstackoverflow.com%2f"

    def __init__(self, session):
        self.session = session

    @classmethod
    def crawl(cls, email, password):
        page = requests.get(cls.SEARCH_URL, headers={"User-Agent": "Mozilla/5.0"})
        sauce = BeautifulSoup(page.text, "lxml")
        fkey = sauce.select_one("[name='fkey']")["value"]
        payload = {"fkey": fkey, "email": email, "password": password}
        res = requests.post(cls.SEARCH_URL, data=payload, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(res.text, "lxml")
        user = soup.select_one("div[class^='gravatar-wrapper-']").get("title")
        yield user

if __name__ == '__main__':
    with requests.Session() as s:
        result = StackOverflow(s)
        for item in result.crawl("email", "password"):
            print(item)
How can I use the session from __init__ within the classmethod?
You can't access self.session from a class method. The __init__ method is called when an instance of the class is created; class methods, however, are not bound to any particular instance of the class but to the class itself, which is why the first parameter is conventionally cls and not self.
You decided to create the session in __init__, so it can be assumed that
so1 = StackOverflow(s1)
so2 = StackOverflow(s2)
keep their sessions separate. If that is indeed your intention, the crawl method should not be decorated with @classmethod. If you define crawl(self, email, password), you will still be able to use StackOverflow.SEARCH_URL or self.__class__.SEARCH_URL to get the value defined on the StackOverflow class, or self.SEARCH_URL, which by default resolves to the same value but could be overridden per instance with so1.SEARCH_URL = "sth else" (so2.SEARCH_URL would keep its original value).
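A minimal sketch of that instance-method version, reusing self.session for both requests (the credentials are placeholders):

import requests
from bs4 import BeautifulSoup

class StackOverflow:
    SEARCH_URL = "https://stackoverflow.com/users/login?ssrc=head&returnurl=https%3a%2f%2fstackoverflow.com%2f"

    def __init__(self, session):
        self.session = session
        self.session.headers["User-Agent"] = "Mozilla/5.0"

    def crawl(self, email, password):  # instance method, so self.session is available
        page = self.session.get(self.SEARCH_URL)
        fkey = BeautifulSoup(page.text, "lxml").select_one("[name='fkey']")["value"]
        payload = {"fkey": fkey, "email": email, "password": password}
        res = self.session.post(self.SEARCH_URL, data=payload)
        soup = BeautifulSoup(res.text, "lxml")
        yield soup.select_one("div[class^='gravatar-wrapper-']").get("title")

with requests.Session() as s:
    for item in StackOverflow(s).crawl("email", "password"):
        print(item)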

Unable to make a bridge between two classes

I've written some code in Python, and my intention is to supply the newly produced links from the web_parser class to the get_docs class. However, I can't think of anything productive to do so. All I want to do is bridge a connection between the two classes, so that the web_parser class produces links and the get_docs class processes them to get the refined output. Any idea as to how I can do it flawlessly would be highly appreciated. Thanks in advance.
from lxml import html
import requests

class web_parser:
    page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    main_url = "https://www.yellowpages.com"

    def __init__(self, link):
        self.link = link
        self.vault = []

    def parser(self):
        self.get_link(self.page_link)

    def get_link(self, url):
        page = requests.get(url)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            self.vault.append(self.main_url + item_link)

class get_docs(web_parser):
    def __init__(self, new_links):
        web_parser.__init__(self, link)
        self.new_links = [new_links]

    def procuring_links(self):
        for link in self.vault:
            self.using_links(link)

    def using_links(self, newly_created_link):
        page = requests.get(newly_created_link)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        print(name, phone)

if __name__ == '__main__':
    crawl = web_parser(web_parser.page_link)
    parse = get_docs(crawl)
    parse.parser()
    parse.procuring_links()
I know very little about creating classes, so please forgive my ignorance. Upon execution at this stage I get an error:
web_parser.__init__(self, link)
NameError: name 'link' is not defined
I'm not very sure how you want to use it: by giving a parameter to web_parser, or by using a hardcoded link inside the class?
From the commands you are using in __main__, you could proceed as below:
class get_docs(object):
    def __init__(self, web_parser):
        self.vault = web_parser.vault

if __name__ == '__main__':
    crawl = web_parser()        # create an instance
    crawl.parser()
    parse = get_docs(crawl)     # give the instance to get_docs, or directly the vault with crawl.vault
    parse.procuring_links()     # execute the get_docs processing
You'll need to correct the web_parser class too: you have to choose between a parameter given during creation (link) and the hardcoded page_link, then adapt the parser() method to target the right one.
class web_parser:
    def __init__(self, link=''):
        self.link = link
        self.vault = []
        self.page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
        self.main_url = "https://www.yellowpages.com"
To fix the NameError you posted in your question, you need to add another parameter to __init__ of your subclass - and pass something to it.
class get_docs(web_parser):
    # def __init__(self, new_links):
    def __init__(self, link, new_links):
        web_parser.__init__(self, link)
        self.new_links = [new_links]
Although web_parser doesn't seem to do anything with that data, so maybe just remove it from the base class.
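Putting the pieces together, a minimal sketch of the composition approach (get_docs takes the finished web_parser instance rather than inheriting from it; the selectors are the ones from the question):

from lxml import html
import requests

class web_parser:
    page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    main_url = "https://www.yellowpages.com"

    def __init__(self):
        self.vault = []

    def parser(self):
        page = requests.get(self.page_link)
        tree = html.fromstring(page.text)
        for item_link in tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href'):
            self.vault.append(self.main_url + item_link)

class get_docs:
    def __init__(self, parser_instance):
        # take the already-populated instance; no inheritance needed
        self.vault = parser_instance.vault

    def procuring_links(self):
        for link in self.vault:
            page = requests.get(link)
            tree = html.fromstring(page.text)
            print(tree.findtext('.//div[@class="sales-info"]/h1'),
                  tree.findtext('.//p[@class="phone"]'))

if __name__ == '__main__':
    crawl = web_parser()
    crawl.parser()                        # fills crawl.vault with the produced links
    get_docs(crawl).procuring_links()     # consumes them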

My web crawler doesn't work with BeautifulSoup

I am trying to make a web crawler using Python. I am borrowing this code from the book Programming Collective Intelligence by Toby Segaran. Since the code from the book was outdated, I made some necessary changes, but the program still doesn't execute as expected. Here is my code:
import urllib
from urllib import request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import bs4

# Create a list of words to ignore
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])

class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        pass

    def __del__(self):
        pass

    def dbcommit(self):
        pass

    # Auxiliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self, table, field, value, createnew=True):
        return None

    # Index an individual page
    def addtoindex(self, url, soup):
        print('Indexing %s' % url)

    # Extract the text from an HTML page (no tags)
    def gettextonly(self, soup):
        return None

    # Separate the words by any non-whitespace character
    def separatewords(self, text):
        return None

    # Return true if this url is already indexed
    def isindexed(self, url):
        return False

    # Add a link between two pages
    def addlinkref(self, urlFrom, urlTo, linkText):
        pass

    # Starting with a list of pages, do a breadth
    # first search to the given depth, indexing pages
    # as we go
    def crawl(self, pages, depth=2):
        pass

    # Create the database tables
    def createindextables(self):
        pass

    def crawl(self, pages, depth=2):
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = request.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                self.addtoindex(page, soup)
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # remove location portion
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)
                self.dbcommit()
            pages = newpages

pagelist = ['http://google.com']
# pagelist=['file:///C:/Users/admin/Desktop/abcd.html']
crawler = crawler('')
crawler.crawl(pagelist)
The only output I get is:
"Indexing http://google.com"
"Indexing http://google.com"
press any key to continue...
Every time I put another link in the page list, I get the same output, "Indexing xyz", where xyz is every link I put in pagelist. I also tried making an HTML file with lots of <a> tags, but that didn't work either.
The problem is in your line links=soup('a'). Calling the soup object directly is implicit shorthand; if you want to find elements, it is clearer to use the explicitly named search methods such as soup.find_all('a') (see the bs4 documentation).
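For what it's worth, a minimal sketch of the link-extraction step with the explicit calls spelled out (note that BeautifulSoup should also be given a parser name, e.g. 'html.parser', to avoid the missing-parser warning):

from urllib import request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

page = 'http://google.com'
c = request.urlopen(page)
soup = BeautifulSoup(c.read(), 'html.parser')  # explicit parser avoids the bs4 warning

for link in soup.find_all('a'):                # explicit form of soup('a')
    if 'href' in link.attrs:
        url = urljoin(page, link['href'])
        url = url.split('#')[0]                # remove the fragment portion
        if url.startswith('http'):
            print(url)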
