I am trying to use Luigi to build a small scraping pipeline, and I'm using Pillow to save the images from the pages I scrape. However, I'm struggling with the output when I try to save each image in a loop (e.g. I want to save img_1, img_2, img_3, etc. in the output folder). I tried passing an "image_id" parameter to the output function, but it doesn't work and I can't figure out how to accomplish this.
class DownloadImages(luigi.Task):
    def requires(self):
        pass  # taking out dependencies for this example

    def output(self, image_id):
        return luigi.LocalTarget(f"img/img_{image_id}.jpeg")

    def run(self):
        resp = requests.get("https://my-site.com")
        soup = BeautifulSoup(resp.content, "html.parser")
        images_list = soup.select("img")
        for image_id in range(len(images_list)):
            image_url = images_list[image_id]["src"]
            img = Image.open(requests.get(image_url, stream=True).raw)
            img.save(self.output(image_id).path)
New answer, since the approach is completely different. Run the task with:
python -m luigi --module scraper DownloadImages --local-scheduler
where scraper.py contains:
from PIL import Image
import requests
import luigi


class DownloadImages(luigi.Task):
    # one target pattern for all images; "*" is replaced with img_<id> when saving
    save_path = "img/*.jpg"

    def output(self):
        return luigi.LocalTarget(self.save_path)

    def run(self):
        img_ids = [1, 2, 3]
        self.imgs = []
        for img_id in img_ids:
            img = Image.open(requests.get("https://i.kym-cdn.com/entries/icons/original/000/000/007/bd6.jpg", stream=True).raw)
            img.save(self.save_path.replace("*", f"img_{img_id}"))
The point of Luigi is to define a workflow. It uses output() to pass the location of your data between tasks. You cannot add extra arguments to output(), because Luigi itself calls that method; you are only supposed to call it yourself to get the location where you want to save your output.
Disclaimer: I might be using it wrong too. Please go look at the documentation.
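A common way around this (my sketch, not part of the original answer; the URLs are placeholders) is to make image_id a task parameter, so each image gets its own task and its own target, and to fan the tasks out from a wrapper task:

import os

import luigi
import requests
from PIL import Image


class DownloadImage(luigi.Task):
    # one task instance per image; the parameters become part of the target path
    image_id = luigi.IntParameter()
    image_url = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget(f"img/img_{self.image_id}.jpeg")

    def run(self):
        os.makedirs("img", exist_ok=True)
        img = Image.open(requests.get(self.image_url, stream=True).raw)
        img.save(self.output().path)


class DownloadImages(luigi.WrapperTask):
    # fans out one DownloadImage per URL; this URL list is a placeholder for the scraped one
    def requires(self):
        urls = ["https://my-site.com/a.jpeg", "https://my-site.com/b.jpeg"]
        return [DownloadImage(image_id=i, image_url=u) for i, u in enumerate(urls)]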
Related
I have been working on an I/O-bound application where I run multiple scripts at the same time, depending on the arguments I call a script with, e.g. monitor.py --s="sydsvenskan", monitor.py -ss="bbc", and so on.
from __future__ import annotations
from abc import abstractmethod
from typing import ClassVar, Dict
from typing import Optional

import attr
import requests
from selectolax.parser import HTMLParser


@attr.dataclass
class Info:
    """Scraped info about news"""
    all_articles: set = attr.ib(factory=set)
    store: str = attr.ib(factory=str)
    name: Optional[str] = attr.ib(factory=str)
    image: Optional[str] = attr.ib(factory=str)


class Scraper:
    scrapers: ClassVar[Dict[str, Scraper]] = {}
    domain: ClassVar[str]

    def __init_subclass__(cls) -> None:
        Scraper.scrapers[cls.domain] = cls

    @classmethod
    def for_url(cls, domain, url) -> Scraper:
        return cls.scrapers[domain](url)

    @abstractmethod
    def scrape_feed(self):
        pass

    @abstractmethod
    def scrape_product(self):
        pass


class BBCScraper(Scraper):
    domain = 'BBC'

    def __init__(self, url):
        self.url = url

    def scrape_feed(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                all_articles = {
                    f"https://www.BBC.se{product_link.attrs['href']}" for product_link in
                    doc.css('td.search-productnamne > a, div.product-image > a')
                }
                return Info(
                    store="BBC",
                    all_articles=all_articles
                )

    def scrape_product(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                # FIXME Scrape valid webelements
                name = "Test"
                image = "Test"
                return Info(
                    store="BBC",
                    name=name,
                    image=image,
                )


class SydsvenskanScraper(Scraper):
    domain = 'Sydsvenskan'

    def __init__(self, url):
        self.url = url

    def scrape_feed(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                all_articles = {
                    f"https://Sydsvenskan.se/{product_link.attrs['href']}" for product_link in
                    doc.css('div.product-image > a, td.search-productnamne > a')
                }
                return Info(
                    store="Sydsvenskan",
                    all_articles=all_articles
                )

    def scrape_product(self):
        with requests.get(self.url) as rep:
            # FIXME Better way than this atleast :P
            if rep:
                doc = HTMLParser(rep.text)
                # FIXME Scrape valid webelements
                name = "Test"
                image = "Test"
                return Info(
                    store="Sydsvenskan",
                    name=name,
                    image=image,
                )


if __name__ == "__main__":
    # FIXME Use arguments instead
    domain = 'BBC'
    url = 'https://www.bbc.co.uk/'
    scraper = Scraper.for_url(domain, url)
    r = scraper.scrape_feed()
    print(r)
As you can see, I currently have hardcoded:
domain = 'BBC'
url = 'https://www.bbc.co.uk/'
which will eventually be passed in through arguments instead.
However, if I start adding more stores/news sites to the Scraper class, e.g. 40 different sites, it would be pretty hard to navigate to the correct code when I want to maintain it or make any changes.
I wonder how I can split the code into different files so that, for example, Sydsvenskan lives in its own file and BBC in its own. That way I can maintain the code more easily if there are any changes in the future.
OK, I understand what you're looking for, and I'm sorry to say you're out of luck, at least as far as my knowledge of Python goes. You can do it two ways.
1. Use importlib to search through a folder/package that contains those files and import them into a list or dict to be retrieved. You said you wanted to avoid this, but either way you would have to use importlib, and #2 is the reason why.
2. Use a base class that, when subclassed, registers the derived class in a list or dict so you can retrieve it via the class object (your __init_subclass__ already does this). The issue here is that if you move a derived class into a new file, that code won't run until you import it. So you would still need to explicitly import the file, or implicitly import it via importlib (dynamic import).
So you'll have to use importlib (dynamic import) either way.
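A minimal sketch of the dynamic-import approach, assuming the per-site classes are moved into a scrapers package (scrapers/bbc.py, scrapers/sydsvenskan.py, ...); the package name and layout are my assumptions, not from the question:

import importlib
import pkgutil

import scrapers  # hypothetical package holding one module per site


def load_scrapers() -> None:
    # importing each module executes its class definitions, which triggers
    # Scraper.__init_subclass__ and fills the Scraper.scrapers registry
    for module_info in pkgutil.iter_modules(scrapers.__path__):
        importlib.import_module(f"scrapers.{module_info.name}")


load_scrapers()
# after this, Scraper.for_url('BBC', 'https://www.bbc.co.uk/') works as before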
I learned recently that you can use wget -r -P ./pdfs -A pdf http://example.com/ to recursively download PDF files from a website. However, this is not cross-platform, as Windows doesn't have wget. I want to use Python to achieve the same thing. The only solutions I've seen are non-recursive, e.g. https://stackoverflow.com/a/54618327/3042018
I would also like to be able to just get the names of the files without downloading them, so I can check whether a file has already been downloaded.
There are so many tools available in Python. What is a good solution here? Should I use one of the "mainstream" packages like Scrapy or Selenium, or maybe just requests? Which is the most suitable for this task, and how do I implement it?
You can try several other approaches; maybe you'll find the right one for you. Here's an example.
If it's just a single download, you can use the following method.
from simplified_scrapy import req, utils
res = req.get("http://example.com/xxx.pdf")
path = "./pdfs/xxx.pdf"
utils.saveResponseAsFile(res, path)
If you need to download the page first and then extract the PDF links from it, you can use the following method.
import os, sys
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils


class MySpider(Spider):
    name = 'download_pdf'
    start_urls = ["http://example.com/"]  # Entry page

    def __init__(self):
        Spider.__init__(self, self.name)  # necessary
        if not os.path.exists('./pdfs'):
            os.mkdir('./pdfs')

    def afterResponse(self, response, url, error=None, extra=None):
        try:
            path = './pdfs' + url[url.rindex('/'):]
            index = path.find('?')
            if index > 0:
                path = path[:index]
            flag = utils.saveResponseAsFile(response, path, fileType="pdf")
            if flag:
                return None
            else:  # If it's not a pdf, leave it to the framework
                return Spider.afterResponse(self, response, url, error)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lst = doc.selects('a').containsReg(".*.pdf", attr="href")
        for a in lst:
            a["url"] = utils.absoluteUrl(url.url, a["href"])
        return {"Urls": lst, "Data": None}


SimplifiedMain.startThread(MySpider())  # Start download
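The question also asks about listing the file names without downloading them; here is a rough sketch of that part using requests and BeautifulSoup (my own example, separate from the spider above, and limited to links found on a single page):

import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def list_pdf_urls(page_url):
    """Collect absolute URLs of every .pdf link found on one page."""
    resp = requests.get(page_url)
    soup = BeautifulSoup(resp.text, "html.parser")
    pdf_urls = []
    for a in soup.find_all("a", href=True):
        href = urljoin(page_url, a["href"])
        if urlparse(href).path.lower().endswith(".pdf"):
            pdf_urls.append(href)
    return pdf_urls


# skip files that already exist locally before deciding to download
for url in list_pdf_urls("http://example.com/"):
    filename = os.path.basename(urlparse(url).path)
    if not os.path.exists(os.path.join("./pdfs", filename)):
        print("not downloaded yet:", url)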
I'm very new to coding, so forgive any errors I make in my explanation! I'm trying to write Python code that uses PRAW to access the /r/pics subreddit, scrape the source URLs, and display them with urllib, cv2, and numpy.
Currently my code looks like this:
import praw
import numpy as np
import urllib.request
import cv2

# urllib set-up
def reddit_scrape(url):
    resp = urllib.request.urlopen(url)
    image = np.asarray(bytearray(resp.read()), dtype="uint8")
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    return image

# reddit set-up
reddit = praw.Reddit(client_id = 'id',
                     client_secret = 'secret',
                     user_agent = 'agent')

subreddit = reddit.subreddit('pics')
hot_pics = subreddit.hot(limit=10)

for submission in hot_pics:
    if not submission.stickied:
        print(submission.url)

# print images
urls = [submission.url]
for url in urls:
    image = reddit_scrape(url)
    cv2.imshow('image', image)
    cv2.waitKey(0)
My problem when I run this is that, although the print(submission.url) line prints the full list of the top 10 posts, only the last URL in the list is actually opened and displayed.
My guess is that the error lies somewhere in my definition of
urls = [submission.url]
But I can't define 'urls' as a static list of URLs, because the hot list changes over time.
What am I doing wrong? Is there even a right way to do this? Any help would be greatly appreciated.
At the end of your for loop, submission is simply whatever the last submission was, so when you write urls = [submission.url] outside the loop you're only getting the last URL. Instead, you should create a list and append to it inside the loop:
urls = []
for submission in hot_pics:
    if not submission.stickied:
        urls.append(submission.url)
Or even the more Pythonic:
urls = [submission.url for submission in hot_pics if not submission.stickied]
Then the for url in urls loop will go through all of the collected URLs.
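Putting it together with the display code from the question (a sketch; reddit_scrape, hot_pics, and the cv2 setup are the ones defined in the question, and this replaces the original print loop since hot_pics can only be iterated once):

# collect the non-stickied urls in one pass, then display each image
urls = [submission.url for submission in hot_pics if not submission.stickied]

for url in urls:
    print(url)
    image = reddit_scrape(url)
    cv2.imshow('image', image)
    cv2.waitKey(0)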
I've written some code in Python, and my intention is to supply the links produced by the "web_parser" class to the "get_docs" class. However, I can't think of a productive way to do so. All I want to do is bridge the two classes so that the "web_parser" class produces the links and the "get_docs" class processes them to get the refined output. Any idea as to how I can do this flawlessly would be highly appreciated. Thanks in advance.
from lxml import html
import requests


class web_parser:
    page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
    main_url = "https://www.yellowpages.com"

    def __init__(self, link):
        self.link = link
        self.vault = []

    def parser(self):
        self.get_link(self.page_link)

    def get_link(self, url):
        page = requests.get(url)
        tree = html.fromstring(page.text)
        item_links = tree.xpath('//h2[@class="n"]/a[@class="business-name"][not(@itemprop="name")]/@href')
        for item_link in item_links:
            self.vault.append(self.main_url + item_link)


class get_docs(web_parser):
    def __init__(self, new_links):
        web_parser.__init__(self, link)
        self.new_links = [new_links]

    def procuring_links(self):
        for link in self.vault:
            self.using_links(link)

    def using_links(self, newly_created_link):
        page = requests.get(newly_created_link)
        tree = html.fromstring(page.text)
        name = tree.findtext('.//div[@class="sales-info"]/h1')
        phone = tree.findtext('.//p[@class="phone"]')
        print(name, phone)


if __name__ == '__main__':
    crawl = web_parser(web_parser.page_link)
    parse = get_docs(crawl)
    parse.parser()
    parse.procuring_links()
I know very little about creating classes, so please forgive my ignorance. Upon execution at this stage, I get this error:
web_parser.__init__(self, link)
NameError: name 'link' is not defined
I'm not very sure how you want to use it: by giving a parameter to web_parser, or by using a hardcoded link inside the class?
From the commands you are using in __main__, you could proceed like below:
class get_docs(object):
    def __init__(self, web_parser):
        self.vault = web_parser.vault


if __name__ == '__main__':
    crawl = web_parser()     # create an instance
    crawl.parser()
    parse = get_docs(crawl)  # give the instance to get_docs, or directly the vault with crawl.vault
    parse.procuring_links()  # execute get_docs processing
You'll need to correct the web_parser class too:
you have to choose between a parameter given at creation time (link) and the hardcoded page_link; just adapt the parser() method to target the right one.
class web_parser:
    def __init__(self, link=''):
        self.link = link
        self.vault = []
        self.page_link = "https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=San%20Francisco%2C%20CA"
        self.main_url = "https://www.yellowpages.com"
To fix the NameError you posted in your question, you need to add another parameter to the __init__ of your subclass and pass something to it.
class get_docs(web_parser):
    # def __init__(self, new_links):
    def __init__(self, link, new_links):
        web_parser.__init__(self, link)
        self.new_links = [new_links]
That said, web_parser doesn't seem to do anything with that data, so maybe just remove it from the base class.
I get an image from a URL and try to upload it to Odoo (product.template, image column). I have tried many methods, but none of them worked. Could you show me the right way to upload a product image to Odoo without using CSV import?
This worked for me:
import urllib2
import base64

image = urllib2.urlopen('http://ddd.com/somepics.jpg').read()
image_base64 = base64.encodestring(image)
product.image_medium = image_base64  # (new API, v9)

# in old api maybe something like
# prod_obj.write(prod_id, {'image_medium': image_base64})
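If you're on Python 3, a rough equivalent of the snippet above (my adaptation, not from the original answer) would be:

import base64
import urllib.request

# same idea as above, adapted to Python 3; 'product' is the record, as in the original fragment
image = urllib.request.urlopen('http://ddd.com/somepics.jpg').read()
image_base64 = base64.b64encode(image)
product.image_medium = image_base64  # same field as in the answer (Odoo 9 API)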
You may need something like this, using the psycopg2 library:

try:
    logo = urllib2.urlopen(logo_url).read()
except:
    print 'waiting 60s'
    time.sleep(60)
    logo = urllib2.urlopen(logo_url).read()

res_data = {'image': psycopg2.Binary(logo)}
...
If you have the image URL and need to set it on the product, you can do the following and call this method when installing/upgrading your custom module.

import requests
import base64


@api.multi
def get_image(self):
    for product in self:
        img = False
        if product.image_url:  # assumed: a custom field on the product holding the source URL
            response = requests.get(product.image_url)
            if response.ok and response.content:
                img = base64.b64encode(response.content)
            else:
                img = False
        product.image = img
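To actually trigger it on install/upgrade, one option is a post_init_hook that calls the method on all product templates (a sketch; it assumes get_image is defined on product.template in your custom module, and the hook signature matches the older Odoo versions that still use @api.multi):

# in your module's __init__.py, with 'post_init_hook' declared in __manifest__.py
from odoo import api, SUPERUSER_ID


def post_init_hook(cr, registry):
    # build an environment and run get_image() for every product template
    env = api.Environment(cr, SUPERUSER_ID, {})
    env['product.template'].search([]).get_image()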