I've familiarized myself with the concept, most notably by watching Raymond Hettinger's excellent video and reading the accepted answer here, and I am wondering what I got wrong.
from urllib2 import urlopen  # Python 2, matching the traceback below

class ReadHTML(object):
    def __init__(self, url):
        page = urlopen(url).read()
        self.page = page

    @classmethod
    def from_file(cls, path):
        page = open(path).read()
        return cls(page)
This works:
r = ReadHTML('http://example.com')
print r.page
but this does not:
r = ReadHTML.from_file('example.html')
print r.page
it throws an error, as if I were trying to "urlopen" a file:
File "/usr/lib/python2.7/urllib2.py", line 258, in get_type
raise ValueError, "unknown url type: %s" % self.__original
ValueError: unknown url type: <!doctype html>
Can you see what's wrong?
You are still calling the class initializer, ReadHTML.__init__(), when you call cls(page); that call is no different from calling ReadHTML(page), you are just using a different reference. The method only accepts a url parameter, and the code passes that to urlopen() regardless.
Adjust your ReadHTML.__init__() method to handle being passed a page instead of a URL:
class ReadHTML(object):
    def __init__(self, url=None, page=None):
        if url is not None:
            page = urlopen(url).read()
        self.page = page

    @classmethod
    def from_file(cls, path):
        page = open(path).read()
        return cls(page=page)
Now the code supports both paths to produce an instance.
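For example, both construction paths now produce a working instance (Python 2, matching the question):

r = ReadHTML('http://example.com')
print r.page
r = ReadHTML.from_file('example.html')
print r.page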
In your original code, from_file reads the page, but so does your __init__() constructor, so if you do ReadHTML.from_file('example.html'), you are essentially doing:
page = urlopen(open('example.html').read()).read()
Personally, I prefer Martijn's solution, for semantic clarity, but here is an alternative:
class ReadHTML(object):
    def __init__(self, url, opener=urlopen):
        self.page = opener(url).read()

    @classmethod
    def from_file(cls, path):
        return cls(path, opener=open)
This solution is advantageous because it lets you define arbitrary openers (say, for opening files stored in a database).
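For instance, a minimal sketch of such a custom opener; the documents dict and the db_opener name are hypothetical, and the only requirement is a callable returning an object with a read() method:

from StringIO import StringIO  # Python 2, matching the question

documents = {'homepage': '<!doctype html><p>stored elsewhere</p>'}  # hypothetical store

def db_opener(key):
    # look the page up in the store instead of hitting the network or disk
    return StringIO(documents[key])

r = ReadHTML('homepage', opener=db_opener)
print r.page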
I'm not a big fan of optional parameters overriding each other. I would make it so that the default constructor accepts a string, and I would have two separate alternate constructors to handle a filename and a URL.
I also modified the filename constructor to explicitly close the file.
class ReadHTML(object):
    def __init__(self, page):
        self.page = page

    @classmethod
    def from_filename(cls, path):
        with open(path) as f:
            page = f.read()
        return cls(page)

    @classmethod
    def from_url(cls, url):
        page = urlopen(url).read()
        return cls(page)
As a side note, I believe urllib/urllib2 supports file:// URLs, so strictly speaking you would not need the filename constructor (but I still think it is nice to have).
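If that is right, an absolute path could go through the same from_url constructor (the path shown is hypothetical):

r = ReadHTML.from_url('file:///home/user/example.html')
print r.page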
Related
Hello, I'm working on something and I have got stuck in a script whose first method takes a string argument and does a Google search on it; it then moves into the second method, which takes the resulting website.
For example:
from googlesearch import search

class online():
    def __init__(self):
        self.search = ""

    def get_url(self):
        # note: returning inside the loop exits on the first result
        for i in search(self.search, num_results=10):
            return i

    def dump_data(self):
        pass
So let's say the search is "how to make money?"; it will give me the link to an article.
How can I dump the data that answers the question at that URL?
Also, the search() call from the googlesearch module only returns 1 URL; how can I fix that?
If all you want is to iterate over the top 10 search results, then you can simply return the generator that search() returns. Something like this:
from googlesearch import search

class Google(object):
    def __init__(self, terms):
        self.terms = terms

    def get_urls(self):
        return search(self.terms, num_results=10)

    def dump_data(self):
        pass

for url in Google("how to make money?").get_urls():
    print(url)
Note that it's a bad idea to give a class attribute the same name (e.g. "search") as a function you imported.
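One follow-up note: search() hands back a single-use generator, so if you need the results more than once, materialize them into a list first:

urls = list(Google("how to make money?").get_urls())
print(urls)
print(urls[:3])  # still available for a second pass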
I have been working on an I/O-bound application where I run multiple scripts at the same time, depending on the args I call each script with, e.g. monitor.py --s="sydsvenskan", monitor.py -ss="bbc", and so on.
from __future__ import annotations

from abc import abstractmethod
from typing import ClassVar, Dict, Optional

import attr
import requests
from selectolax.parser import HTMLParser

@attr.dataclass
class Info:
    """Scraped info about news"""
    all_articles: set = attr.ib(factory=set)
    store: str = attr.ib(factory=str)
    name: Optional[str] = attr.ib(factory=str)
    image: Optional[str] = attr.ib(factory=str)

class Scraper:
    scrapers: ClassVar[Dict[str, Scraper]] = {}
    domain: ClassVar[str]

    def __init_subclass__(cls) -> None:
        Scraper.scrapers[cls.domain] = cls

    @classmethod
    def for_url(cls, domain, url) -> Scraper:
        return cls.scrapers[domain](url)

    @abstractmethod
    def scrape_feed(self):
        pass

    @abstractmethod
    def scrape_product(self):
        pass

class BBCScraper(Scraper):
    domain = 'BBC'

    def __init__(self, url):
        self.url = url

    def scrape_feed(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                all_articles = {
                    f"https://www.BBC.se{product_link.attrs['href']}" for product_link in
                    doc.css('td.search-productnamne > a, div.product-image > a')
                }
                return Info(
                    store="BBC",
                    all_articles=all_articles
                )

    def scrape_product(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                # FIXME Scrape valid webelements
                name = "Test"
                image = "Test"
                return Info(
                    store="BBC",
                    name=name,
                    image=image,
                )

class SydsvenskanScraper(Scraper):
    domain = 'Sydsvenskan'

    def __init__(self, url):
        self.url = url

    def scrape_feed(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                all_articles = {
                    f"https://Sydsvenskan.se/{product_link.attrs['href']}" for product_link in
                    doc.css('div.product-image > a, td.search-productnamne > a')
                }
                return Info(
                    store="Sydsvenskan",
                    all_articles=all_articles
                )

    def scrape_product(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                # FIXME Scrape valid webelements
                name = "Test"
                image = "Test"
                return Info(
                    store="Sydsvenskan",
                    name=name,
                    image=image,
                )

if __name__ == "__main__":
    # FIXME Use arguments instead
    domain = 'BBC'
    url = 'https://www.bbc.co.uk/'
    scraper = Scraper.for_url(domain, url)
    r = scraper.scrape_feed()
    print(r)
As you can see, I have currently hardcoded:
domain = 'BBC'
url = 'https://www.bbc.co.uk/'
These values will eventually be passed in through arguments instead.
However, as you can see, if I start to add more stores/news sites to the Scraper class, e.g. 40 different sites, it would become pretty hard to navigate to the correct code when maintaining it or making changes.
I wonder how I can split the code into different files in that case, so that e.g. Sydsvenskan lives in one file and BBC in another. I could then maintain the code more easily if changes are needed in the future.
OK, I understand what you're looking for, and I'm sorry to say you're out of luck, at least as far as my knowledge of Python goes. You can do it two ways:
1. Use importlib to search through a folder/package that contains those files and import them into a list or dict to be retrieved. You said you wanted to avoid this, but either way you would have to use importlib, and option 2 is the reason why.
2. Use a base class that, when inherited, adds the derived class to a list or dict from which you can retrieve it via the class object (your __init_subclass__ hook already does this). However, the issue here is that if you move a derived class into a new file, that registration code won't run until you import the file. So you would still need to explicitly import each file, or implicitly import it via importlib (dynamic import).
So you'll have to use importlib (dynamic import) either way.
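A minimal sketch of that dynamic import, assuming a hypothetical scrapers/ package with one module per site (scrapers/bbc.py, scrapers/sydsvenskan.py, ...), each defining its Scraper subclass:

import importlib
import pkgutil

import scrapers  # hypothetical package holding bbc.py, sydsvenskan.py, ...

def load_scrapers():
    # Importing each module executes its class statement, which fires
    # Scraper.__init_subclass__ and registers the class in Scraper.scrapers.
    for module_info in pkgutil.iter_modules(scrapers.__path__):
        importlib.import_module(f"scrapers.{module_info.name}")

load_scrapers()
scraper = Scraper.for_url('BBC', 'https://www.bbc.co.uk/')
print(scraper.scrape_feed())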
The following link shows a way to render an HTML URL inside a Python 3 print() call within a Jupyter notebook code cell,
https://github.com/jupyterlab/jupyterlab/issues/7393#issue-510053776,
which defines a custom Url class,
"""URL Wrapper."""
from dataclasses import dataclass
#dataclass(frozen=True)
class Url:
"""Wrapper around a URL string to provide nice display in IPython environments."""
__url: str
def _repr_html_(self):
"""HTML link to this URL."""
return f'{self.__url}' # problem here (*)
def __str__(self):
"""Return the underlying string."""
return self.__url
The commentator notes that one must use str(Url(...)) to achieve the desired result.
Unlike (I think) the now built-in rendering of this, I am trying to use this custom class thus:
linker = lambda my_string: str(Url('https://www.google.com/%s' % my_string))
print('URL for my_string is here',linker('search'))
I would like linker('search') to render as the string 'search' with the full hyperlink (https://www.google.com/search) behind it. The built-in behaviour does not render 'search' but rather the full hyperlink, and I cannot find a way to modify the custom class to do this. At the line marked (*) above I have tried:
return f'<a href="{self.__url}">{self.__url}</a>'
return f'<a href="{self.__url}">{"test_text"}</a>'
etc., but so far in vain.
This answer helps a bit but doesn't explicitly use the print function as per my requirements: https://stackoverflow.com/a/43254984/1021819
What am I missing?
This is a bit janky, but it seems to work:
class RenderHyperlink(object):
    def __init__(self, key, link, *args):
        link = link + "/" if not link.endswith("/") else link
        for arg in args:
            link += arg + "/"
        self.__url = '<a href="{}">{}</a>'.format(link, key)

    def __repr__(self):
        from IPython.core.display import display, HTML
        display(HTML(self.__url))
        return ""  # hacky way to satisfy print() even though the display already happened

# notice that you can also add other parameters to the link
print(RenderHyperlink("search", "https://www.google.com/search", "hello"))
Output:
The link points to "https://www.google.com/search/hello/"
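If the print() requirement can be relaxed, an alternative sketch is to call IPython's display machinery directly (IPython.display is the public API), which avoids the empty-string hack:

from IPython.display import display, HTML

def render_link(key, link):
    # render the anchor in the cell output immediately; no print() involved
    display(HTML('<a href="{}">{}</a>'.format(link, key)))

render_link("search", "https://www.google.com/search")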
I wrote a small class for scraping a webpage that holds some documents inside folders, all hosted on S3. I convert the response into an XML tree, and I need to strip the URL prefix from each XML element.
Here's the code and the issue:
import requests
from lxml import etree

class scraper():
    def __init__(self, BASE_URL, headers):
        self.BASE_URL = BASE_URL
        self.headers = headers
        self.URL = self.BASE_URL + '?delimiter=/'

    def clean_root(self, root):
        "Needed to clean the URL prefix in front of each XML element"
        for elem in root.getiterator():
            elem.tag = etree.QName(elem).localname
        return etree.cleanup_namespaces(root)

    def get_root_folder_names(self):
        "Retrieve the folders"
        res = requests.get(self.URL, headers=self.headers)
        root = etree.XML(res.content)
        print(f"root: {root}")  # prints: "root: <Element {http://s3.amazonaws.com/doc/2016-11-11/}ListBucketResult at 0x8f87b456e441>"
        print(f"{self.clean_root(root)}")  # prints: "None", where it should print "<Element ListBucketResult at 0x8f87b456e441>"
call it:
myInstance = scraper(BASE_URL, headers)
myInstance.get_root_folder_names()
If I call clean_root(root) from the get_root_folder_names function, the result is None, as if it was never applied. But root does exist just before the call to this function, as it gets printed correctly. I took my inspiration from here: https://www.kite.com/python/answers/how-to-call-an-instance-method-in-the-same-class-in-python
What am I doing wrong?
I also tried to define the clean_root function without self, but then, when I call it from the get_root_folder_names function, I get NameError: name 'clean_root' is not defined.
The problem isn't really about calling functions from other functions; it's about confusing functions that return a value with those that work by side effect.
The function cleanup_namespaces returns None. It modifies the tree in place rather than creating a new one (this is like the problem beginners often have with list.sort).
Change the end of the clean_root function to this:
etree.cleanup_namespaces(root)
return root
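In context, the corrected method would read (same class and imports as in the question):

def clean_root(self, root):
    "Strip the URL prefix in front of each XML element"
    for elem in root.getiterator():
        elem.tag = etree.QName(elem).localname
    etree.cleanup_namespaces(root)  # in-place: removes now-unused namespace declarations, returns None
    return root  # hand back the tree we just modified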
I've written some code in Python, and my intention is to supply the links newly produced by the web_parser class to the get_docs class. However, I can't think of a productive way to do so. All I want is to bridge a connection between the two classes so that the web_parser class produces links and the get_docs class processes them to get the refined output. Any idea how I can do this flawlessly would be highly appreciated. Thanks in advance.
from lxml import html
import requests

class web_parser:
    page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    main_url = "https://www.yellowpages.com"

    def __init__(self, link):
        self.link = link
        self.vault = []

    def parser(self):
        self.get_link(self.page_link)

    def get_link(self, url):
        page = requests.get(url)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            self.vault.append(self.main_url + item_link)

class get_docs(web_parser):
    def __init__(self, new_links):
        web_parser.__init__(self, link)
        self.new_links = [new_links]

    def procuring_links(self):
        for link in self.vault:
            self.using_links(link)

    def using_links(self, newly_created_link):
        page = requests.get(newly_created_link)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        print(name, phone)

if __name__ == '__main__':
    crawl = web_parser(web_parser.page_link)
    parse = get_docs(crawl)
    parse.parser()
    parse.procuring_links()
I know very little about creating classes, so please forgive my ignorance. Upon execution at this stage I get an error:
web_parser.__init__(self, link)
NameError: name 'link' is not defined
I'm not very sure how you want to use it: by giving a parameter to web_parser, or by using a hardcoded link inside the class?
From the commands you are using in __main__, you could proceed like below:
class get_docs(object):
    def __init__(self, web_parser):
        self.vault = web_parser.vault

if __name__ == '__main__':
    crawl = web_parser()  # create an instance
    crawl.parser()
    parse = get_docs(crawl)  # give the instance to get_docs, or directly the vault with crawl.vault
    parse.procuring_links()  # execute the get_docs processing
You'll need to correct the web_parser class too:
You have to choose between a parameter given at creation time (link) and the hardcoded page_link; just adapt the parser() method to target the right one.
class web_parser:
    def __init__(self, link=''):
        self.link = link
        self.vault = []
        self.page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
        self.main_url = "https://www.yellowpages.com"
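For instance, parser() could prefer the link given at creation time and fall back to the hardcoded one (a sketch of one way to adapt it):

def parser(self):
    # use the constructor argument when provided, else the default search page
    self.get_link(self.link or self.page_link)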
To fix the NameError you posted in your question, you need to add another parameter to the __init__ of your subclass, and pass something to it.
class get_docs(web_parser):
    # def __init__(self, new_links):
    def __init__(self, link, new_links):
        web_parser.__init__(self, link)
        self.new_links = [new_links]
Although web_parser doesn't seem to do anything with that data, so maybe just remove it from the base class.
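Putting the pieces together, a minimal sketch of the composition route suggested above, reusing the question's classes (get_docs receives a finished web_parser instance instead of inheriting from it):

class get_docs(object):
    def __init__(self, web_parser):
        # composition: reuse the links the parser has already collected
        self.vault = web_parser.vault

    def procuring_links(self):
        for link in self.vault:
            self.using_links(link)

    def using_links(self, newly_created_link):
        page = requests.get(newly_created_link)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        print(name, phone)

if __name__ == '__main__':
    crawl = web_parser(web_parser.page_link)  # build the parser
    crawl.parser()                            # fill crawl.vault with business links
    parse = get_docs(crawl)                   # hand the finished parser to get_docs
    parse.procuring_links()                   # scrape each collected link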