download images with google customsearch api - python

My question is probably trivial, but I tried to configure my custom search engine to download the first 10 results from Google Images. Here is the Python script. It works, except that my results are very different from those of Google Images with the same parameters. Can someone tell me what I am missing?
# -*- coding: utf-8 -*-
import os
import sys
from urllib import FancyURLopener
import urllib2
import simplejson
from apiclient.discovery import build

class MyOpener(FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

myopener = MyOpener()

service = build("customsearch", "v1",
                developerKey="** my Key **")

res = service.cse().list(
    q='SearchTerm',
    cx='** my cx**',
    searchType='image',
    num=10,
    imgType='photo',
    fileType='jpg',
    imgSize="xxlarge"
).execute()

count = 0
for item in res['items']:
    myopener.retrieve(item['link'], str(count))
    count = count + 1

Related

how to detect changes on website? python web scraping

I started writing a Python monitor for a shoe website. Now I would like to know if there is a way to detect when the site is updated. For example: if there is a change in the available shoe sizes -> send a webhook to my Discord.
I don't know how to detect changes on the site. Please help me; if you have an idea, let me know :)
Here is my Discord webhook code:
from dhooks import Webhook, Embed
import requests
import bs4
from bs4 import BeautifulSoup
import lxml

hook = Webhook('your_discord_webhook_url')  # added: `hook` was used below but never defined

url = "https://en.aw-lab.com/women/shoes/new-arrivals-AW_10008AAQB.html?cgid=women_shoes_newin&dwvar_AW__10008AAQB_color=5011614"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

res = requests.get(url, headers=headers)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")

img_shoes = "https://en.aw-lab.com/dw/image/v2/BCLG_PRD/on/demandware.static/-/Sites-awlab-master-catalog/default/dwd9415a8e/images/large/5011614_0.jpg?sw=843"
size = soup.select(".b-size-selector__item-0")
array_size = []
url_shoes = "[ADIDAS SUPERSTAR BOLD](" + url + ")"

embed = Embed(
    description=url_shoes,
    color=0x5CDBF0,
    timestamp='now'
)

for sizes in size:
    get_sizes = sizes.getText()
    array_size.append(get_sizes.strip())

embed.add_field(name="Size", value=('\n'.join(map(str, array_size))))
embed.set_thumbnail(img_shoes)
hook.send(embed=embed)
You can use the hashlib module to compute a checksum of the page, save it, and then compute it again to check whether it has changed. NOTE: any subtle change to the page will change the checksum!
import hashlib
# ...
checksum = hashlib.sha256(res.text.encode('utf-8')).hexdigest()
# save it to a txt file as a comparison for the next accesses
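To turn that into a monitor, here is a minimal polling sketch; the URL, the interval, and the notification call are placeholders you would replace with your own:
import hashlib
import time
import requests

URL = "https://example.com/product-page"  # hypothetical page to watch
CHECK_EVERY = 60  # seconds between polls

def page_checksum(url):
    res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    res.raise_for_status()
    return hashlib.sha256(res.text.encode('utf-8')).hexdigest()

last = page_checksum(URL)
while True:
    time.sleep(CHECK_EVERY)
    current = page_checksum(URL)
    if current != last:
        print("Page changed!")  # e.g. send your Discord webhook here
        last = current
Since hashing the whole page also reacts to ads and timestamps, hashing only the markup you care about (for example the text of soup.select(".b-size-selector__item-0")) should give fewer false positives.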

Retrieving links from a Google search using BeautifulSoup in Python

I'm building a Twitter bot using Tweepy and BeautifulSoup4. I'd like to save the results of a request in a list, but my script isn't working anymore (it was working a few days ago). I've been looking at it and I don't understand why. Here is my function:
import requests
import tweepy
from bs4 import BeautifulSoup
import urllib
import os
from tweepy import StreamListener
from TwitterEngine import TwitterEngine
from ConfigEngine import TwitterAPIConfig
import urllib.request
import emoji
import random

# desktop user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"

# Retrieve the links
def parseLinks(url):
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_='r'):
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0]['href']
                results.append(link)
        return results
The "url" parameter is 100% correct in the rest of the code. As an output, I get a "None". To be more precise, the execution stops right after line "results = []" (so it doesn't enter into the for).
Any idea?
Thank you so much in advance!
It seems that Google changed the HTML markup on the page. Try changing the search from class="r" to class="rc":
import requests
from bs4 import BeautifulSoup

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"

def parseLinks(url):
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_='rc'):  # <-- change 'r' to 'rc'
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0]['href']
                results.append(link)
        return results

url = 'https://www.google.com/search?q=tree'
print(parseLinks(url))
Prints:
['https://en.wikipedia.org/wiki/Tree', 'https://simple.wikipedia.org/wiki/Tree', 'https://www.britannica.com/plant/tree', 'https://www.treepeople.org/tree-benefits', 'https://books.google.sk/books?id=yNGrqIaaYvgC&pg=PA20&lpg=PA20&dq=tree&source=bl&ots=_TP8PqSDlT&sig=ACfU3U16j9xRJgr31RraX0HlQZ0ryv9rcA&hl=sk&sa=X&ved=2ahUKEwjOq8fXyKjsAhXhAWMBHToMDw4Q6AEwG3oECAcQAg', 'https://teamtrees.org/', 'https://www.woodlandtrust.org.uk/trees-woods-and-wildlife/british-trees/a-z-of-british-trees/', 'https://artsandculture.google.com/entity/tree/m07j7r?categoryId=other']

I can't login to the site with Requests Sessions

I want to log in to the website using Requests sessions.
For example:
https://turbobit.net
But I can't log in normally. The code is as follows:
# coding=utf-8
import lxml
import re
import requests
import socket
import socks
import sys
import time
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = 'https://turbobit.net/user/login'
header = {
    'Host': 'turbobit.net',
    'Referer': 'https://turbobit.net/user/login',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3673.0 Safari/537.36'
}
form_data = {
    'user[login]': 'your_email',
    'user[pass]': 'your_password'
}

session = requests.session()
login = session.post(url, data=form_data, headers=header)
page = session.get('https://turbobit.net/').content
soup = BeautifulSoup(page, features='html.parser')

msgs = soup.find('div', {"class": 'logged'})
for msg in msgs:
    print(msg.get_text())
BeautifulSoup does not interact with a web application in real time; it just takes data as input (a string or byte string) and parses it as properly formatted HTML.
If you want to simulate interactions with a web application, such as clicking, entering text, and logging in, you should try other options like Selenium, which is a browser automation framework and comes in very handy in such cases.
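For instance, a minimal Selenium sketch for the login form above might look like this; it assumes geckodriver is installed and on your PATH, and reuses the user[login]/user[pass] field names from the question:
from selenium import webdriver

driver = webdriver.Firefox()  # assumes a local geckodriver
driver.get('https://turbobit.net/user/login')

# locate the form fields by their `name` attributes, fill them in, submit
driver.find_element_by_name('user[login]').send_keys('your_email')
password_field = driver.find_element_by_name('user[pass]')
password_field.send_keys('your_password')
password_field.submit()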
Here's an example of performing an automated login via a Python script.
First of all, find the names of the inputs used on the website's form for the username
<form ... name=username ... />
and the password
<form ... name=password ... />
and replace them in the script below. Also replace the URL to point at the desired site to log in to.
Example code: login.py
#!/usr/bin/env python
import requests

payload = {'username': 'user@email.com', 'password': 'sup3rs3cretp4ssw0rd'}
url = 'https://example.com/login.html'
requests.post(url, data=payload, verify=False)
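Note that a bare requests.post throws away the cookies the server sets on login. If you need to stay logged in for follow-up requests, a sketch with requests.Session (same placeholder URL and field names as above) keeps them for you:
import requests

payload = {'username': 'user@email.com', 'password': 'sup3rs3cretp4ssw0rd'}

with requests.Session() as session:
    # the session stores the cookies set by the login response
    session.post('https://example.com/login.html', data=payload)
    # ...and sends them automatically with subsequent requests
    profile = session.get('https://example.com/profile.html')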
Bonus:
To run this script from the command line on a UNIX-based system, place it in a directory, e.g. home/scripts, and add this directory to your PATH in ~/.bash_profile or a similar file used by your terminal.
# Custom scripts export
CUSTOM_SCRIPTS=home/scripts
export PATH=$CUSTOM_SCRIPTS:$PATH
Then create a link named login to this Python script inside home/scripts:
ln -s ~/home/scripts/login.py ~/home/scripts/login
Close your terminal, start a new one, and run login.
I have successfully logged in; the code is as follows:
# coding=utf-8
import lxml
import re
import requests
import socket
import socks
import sys
import time
from bs4 import BeautifulSoup
from urllib.request import urlopen
from requests import Request, Session

email = "your_email"
password = "your_password"

s = requests.Session()
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3673.0 Safari/537.36"
headers = {
    'Authority': 'turbobit.net',
    'Method': 'POST',
    'Path': '/lang/en',
    'Host': 'turbobit.net',
    'Referer': 'https://turbobit.net/login',
    'User-Agent': user_agent
}

def login_site(email, password):
    login_url = 'https://turbobit.net/user/login'
    form_data = {
        'user[login]': email,
        'user[pass]': password,
        'user[submit]': 'Sign in',
        'user[memory]': 'on'
    }
    login = s.post(login_url, data=form_data, headers=headers)
    # print(f"text = {login.text}")
    soup = BeautifulSoup(login.text, "lxml")
    '''If the login is successful, there will be "<div class="lang-links in">" in the output.'''
    msgs = soup.find('div', {"class": "user-menu"})
    print(msgs)

login_site(email, password)
Thank you for your help!

Can't download image with python urllib

I am trying to download an image with python and urllib.
This is my first attempt:
import urllib
url = "https://xxxxxxxxxxxxxxxxxxxxxxxxxx.jpg"
urllib.urlretrieve(url, "myimage.jpg")
The result is an empty (0-byte) file called "myimage.jpg".
The image is accessible from the browser at the same link, so I tried changing the user agent, using this script I found:
from urllib import FancyURLopener

url = "https://xxxxxxxxxxxxxxxxxxxxxxxxxx.jpg"

class MyOpener(FancyURLopener, object):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

myopener = MyOpener()
myopener.retrieve(url, 'myimage.jpg')
The result is again an empty (0-byte) file called "myimage.jpg".
Additional notes:
The robots.txt file is not accessible from the browser: "access denied" error, code 403.
The URL contains the word 'ssl'.
What can I do?
EDIT: The image is linked from another web domain. I noticed that the image is accessible from the browser only if I have first opened it from that specific web domain. If I clear the cookies, the image becomes inaccessible.
It works if the URL exists:
import urllib
url = "https://www.lhorn.de/images/6cfYoU3.png"
png = urllib.urlretrieve(url, "nodejs-1995.png")
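Given the EDIT about cookies, it may be that the server only serves the image when the request carries the cookies (and possibly the Referer) of the page that embeds it. Here is a sketch with requests; the embedding-page URL is a placeholder:
import requests

session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0'

# visit the page that embeds the image first, so the session
# collects whatever cookies the server expects (placeholder URL)
embedding_page = 'https://example.com/page-that-embeds-the-image'
session.get(embedding_page)

resp = session.get('https://xxxxxxxxxxxxxxxxxxxxxxxxxx.jpg',
                   headers={'Referer': embedding_page})
resp.raise_for_status()

with open('myimage.jpg', 'wb') as f:
    f.write(resp.content)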

Proxy Scraper from Python 2.x to Python 3.x conversion

I was trying to convert a very simple function from Python 2 to Python 3 that scrapes a web page and returns a list of proxies I could use with a Twitter robot:
#!/usr/bin/env python
# python25 on windows7
#####################################
# GPL v2
# Author: Arjun Sreedharan
# Email: arjun024@gmail.com
#####################################
import urllib2
import re
import os
import time
import random

def main():
    request = urllib2.Request("http://www.ip-adress.com/proxy_list/")
    # request.add_header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5")
    # Without a Referer header, ip-adress.com gives 403 Forbidden
    request.add_header("Referer", "https://www.google.co.in/")
    f = urllib2.urlopen(request)
    #outfile = open('outfile.htm','w')
    str1 = f.read()
    #outfile.write(str1)
    # normally DOT matches any character EXCEPT newline; re.DOTALL makes dot include newline
    pattern = re.compile('.*<td>(.*)</td>.*<td>Elite</td>.*', re.DOTALL)
    matched = re.search(pattern, str1)
    print(matched.group(1))
    """
    ip = matched.group(1)
    os.system('echo "http_proxy=http://'+ip+'" > ~/.wgetrc')
    if random.randint(1,2)==1:
        os.system('wget --proxy=on -t 1 --timeout=14 --header="User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5" http://funnytweets.in -O /dev/null')
    else:
        os.system('wget --proxy=on -t 1 --timeout=14 --header="User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13" http://funnytweets.in -O /dev/null')
    """

if __name__ == '__main__':
    while True:
        main()
        time.sleep(2)
OK, I already know that urllib2 is different in Python 3, but I could not make it work :( Can anyone help? :) Thanks!
In Python 3, Request and urlopen are located in the urllib.request module, so you have to change your imports accordingly.
from urllib.request import Request, urlopen
You can make your code compatible with both Python 2 and Python 3 if you catch the ImportError exception when importing from urllib2.
try:
    from urllib2 import Request, urlopen
except ImportError:
    from urllib.request import Request, urlopen
Also keep in mind that URLError and HTTPError are located in urllib.error, if you need them.
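For completeness, here is a sketch that applies the same try/except pattern to the error classes and to the question's request; note that in Python 3 read() returns bytes, so the result is decoded before the regular expression is applied:
try:
    # Python 2
    from urllib2 import Request, urlopen, URLError, HTTPError
except ImportError:
    # Python 3
    from urllib.request import Request, urlopen
    from urllib.error import URLError, HTTPError

request = Request("http://www.ip-adress.com/proxy_list/")
request.add_header("Referer", "https://www.google.co.in/")
try:
    str1 = urlopen(request).read().decode('utf-8')
except HTTPError as e:
    print("The server returned an error:", e.code)
except URLError as e:
    print("Failed to reach the server:", e.reason)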
