Python2 BeautifulSoup returns Blank output - python

This is the code I am using for downloading the images from Google page. This code is taking time in Evaluating and downloading the images. Hence, I thought of using the Beautifulsoup Library for faster evaluation and download. Check the below original code:
import time
import sys
import os
import urllib2
search_keyword = ['Australia']
keywords = [' high resolution']
def download_page(url):
import urllib2
try:
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
req = urllib2.Request(url, headers = headers)
response = urllib2.urlopen(req)
page = response.read()
return page
except:
return"Page Not found"
def _images_get_next_item(s):
start_line = s.find('rg_di')
if start_line == -1:
end_quote = 0
link = "no_links"
return link, end_quote
else:
start_line = s.find('"class="rg_meta"')
start_content = s.find('"ou"',start_line+1)
end_content = s.find(',"ow"',start_content+1)
content_raw = str(s[start_content+6:end_content-1])
return content_raw, end_content
def _images_get_all_items(page):
items = []
while True:
item, end_content = _images_get_next_item(page)
if item == "no_links":
break
else:
items.append(item)
time.sleep(0.1)
page = page[end_content:]
return items
t0 = time.time()
i= 0
while i<len(search_keyword):
items = []
iteration = "Item no.: " + str(i+1) + " -->" + " Item name = " + str(search_keyword[i])
print (iteration)
print ("Evaluating...")
search_keywords = search_keyword[i]
search = search_keywords.replace(' ','%20')
try:
os.makedirs(search_keywords)
except OSError, e:
if e.errno != 17:
raise
pass
j = 0
while j<len(keywords):
pure_keyword = keywords[j].replace(' ','%20')
url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
raw_html = (download_page(url))
time.sleep(0.1)
items = items + (_images_get_all_items(raw_html))
j = j + 1
print ("Total Image Links = "+str(len(items)))
print ("\n")
info = open('output.txt', 'a')
info.write(str(i) + ': ' + str(search_keyword[i-1]) + ": " + str(items) + "\n\n\n")
info.close()
t1 = time.time()
total_time = t1-t0
print("Total time taken: "+str(total_time)+" Seconds")
print ("Starting Download...")
k=0
errorCount=0
while(k<len(items)):
from urllib2 import Request,urlopen
from urllib2 import URLError, HTTPError
try:
req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
response = urlopen(req,None,15)
output_file = open(search_keywords+"/"+str(k+1)+".jpg",'wb')
data = response.read()
output_file.write(data)
response.close();
print("completed ====> "+str(k+1))
k=k+1;
except IOError:
errorCount+=1
print("IOError on image "+str(k+1))
k=k+1;
except HTTPError as e:
errorCount+=1
print("HTTPError"+str(k))
k=k+1;
except URLError as e:
errorCount+=1
print("URLError "+str(k))
k=k+1;
i = i+1
print("\n")
print("Everything downloaded!")
print("\n"+str(errorCount)+" ----> total Errors")
I thought editing the below code, will help in making the code work with BeautifulSoup Library and my work will be completed faster:
def download_page(url):
import urllib2
try:
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
req = urllib2.Request(url, headers = headers)
#response = urllib2.urlopen(req)
#page = response.read()
return BeautifulSoup(urlopen(Request(req)), 'html.parser')
except:
return"Page Not found"
But the above code is returning blank. Kindly, let me know what I might do to make the code work excellently well with BeautifulSoup without any trouble.

You can't just pass Google headers like that. The search engine is ALOT more complex than simply substituting some keywords into a GET URL.
HTML is a markup language only useful for one way rendering of human readable information. For your application, you need machine readable markup rather than trying to decipher human readable text. Google already has a very comprehensive API https://developers.google.com/custom-search/ which is easy to use, and a much better way of achieving this than using BeautifulSoup

Related

The new products added to the site in the loop while Webscraping are not detected by the bot [Python]

I have a bot for webscraping and it works fine. The only problem when a new product appears on the site, it is not detected by the bot. I have to relaunch the file for it to find. Do you have any solutions to this?
import requests
import time
import datetime
from discord import send_hook
from datetime import datetime
from colorama import Fore, Back, Style
from bs4 import BeautifulSoup
from colorama import init
init(autoreset=True)
headers = {
'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36', "Referer": "https://www.ldlc.com/"
}
a=0
def liens():
# for x in range (1,5):
url = f'https://www.ldlc.com/recherche/1660/+fcat-4684.html?sort=1'
reqs = requests.get(url,headers=headers, timeout=None, allow_redirects=False)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
productlist = soup.find_all('h3',{'class':'title-3'})
for item in productlist:
for link in item.find_all('a', href=True):
urls.append("https://www.ldlc.com"+link['href'])
print(f"Le nombre de request URL's : {a}")
return urls
prix = 400
wait_time = 259200
timed_pid = {p: 0 for p in liens()}
while True:
try:
liens()
a=a+1
print(f"Le nombre de request : {x}")
x=x+1
nombre_total_liens = len(liens())
print(f'Le nombre de liens trouvé : {nombre_total_liens}')
except:
print("Connection refused by the server..")
print("Let me sleep for 5 seconds")
print("ZZzzzz...")
time.sleep(5)
print("Was a nice sleep, now let me continue...")
continue
time.sleep(15)
for p in timed_pid:
if (time.time() - timed_pid[p]) < wait_time:
continue
url = "{}".format(p)
response = url
while response == url:
try:
response = requests.get(url, headers=headers, timeout=None, allow_redirects=False)
break
except:
print("Connection refused by the server..")
print("Let me sleep for 5 seconds")
print("ZZzzzz...")
time.sleep(5)
print("Was a nice sleep, now let me continue...")
continue
soup = BeautifulSoup(response.text, 'html.parser')
try:
title = soup.find('h1', class_='title-1').get_text().strip()
product_title = ':flag_fr:'+title
except:
print('Impossible de lire le titre du produit! Restart....')
continue
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
time_now = now.strftime("%d/%m/%Y - %H:%M:%S")
try:
image_url = soup.find('img', {'id':'ctl00_cphMainContent_ImgProduct'})['src']
except:
continue
try:
stock_statut = soup.find('div', class_='content').get_text().strip()
price = soup.find_all('div', class_='price')[3].get_text(strip=True).replace(',','.').replace(' ','').replace('€','.').replace(' ','').replace('\xa0','')
if stock_statut == 'Rupture':
print ('[{}]'.format(current_time), Fore.GREEN + Style.RESET_ALL, Fore.YELLOW + '[LDLC]' + Style.RESET_ALL,'|', Fore.RED + 'Rupture' + Style.RESET_ALL, Fore.CYAN + product_title + Style.RESET_ALL)
else:
f = float(price)
if f < prix:
print('[{}]'.format(current_time), Fore.GREEN + Style.RESET_ALL, Fore.YELLOW + '[LDLC]' + Style.RESET_ALL,'|', Fore.GREEN + 'En stock' + Style.RESET_ALL, Fore.CYAN + product_title + Style.RESET_ALL)
timed_pid[p] = time.time()
send_hook(product_title, url, image_url, price, stock_statut, time_now)
except:
continue
occasionally if you have improvements for speed or something else I will take it.
If you could help me, that will make me happy. I thank you in advance.

multiprocessing ThreadPool stops at the end

I wrote a script that "parses" all domains from the file. After the launch, everything works as it should. But when there are several domains left at the end, it gets stuck. Sometimes it takes a long time to parse the last couple of domains. I can't figure out what the problem is. Who has faced such a situation? Tell me how to cure it.
After the launch, everything works out very quickly (as it should) until the end. At the end, it stops when there are several domains left. There is no difference, 1000 domains or 10 000 domains.
Complete code:
import re
import sys
import json
import requests
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
pool = 100
with open("Rules.json") as file:
REGEX = json.loads(file.read())
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'}
def Domain_checker(domain):
try:
r = requests.get("http://" + domain, verify=False, headers=ua)
r.encoding = "utf-8"
for company in REGEX.keys():
for type in REGEX[company]:
check_entry = 0
for ph_regex in REGEX[company][type]:
if bool(re.search(ph_regex, r.text)) is True:
check_entry += 1
if check_entry == len(REGEX[company][type]):
title = BeautifulSoup(r.text, "lxml")
Found_domain = "\nCompany: {0}\nRule: {1}\nURL: {2}\nTitle: {3}\n".format(company, type, r.url, title.title.text)
print(Found_domain)
with open("/tmp/__FOUND_DOMAINS__.txt", "a", encoding='utf-8', errors = 'ignore') as file:
file.write(Found_domain)
except requests.exceptions.ConnectionError:
pass
except requests.exceptions.TooManyRedirects:
pass
except requests.exceptions.InvalidSchema:
pass
except requests.exceptions.InvalidURL:
pass
except UnicodeError:
pass
except requests.exceptions.ChunkedEncodingError:
pass
except requests.exceptions.ContentDecodingError:
pass
except AttributeError:
pass
except ValueError:
pass
return domain
if __name__ == '__main__':
with open(sys.argv[1], "r", encoding='utf-8', errors = 'ignore') as file:
Domains = file.read().split()
pool = 100
print("Pool = ", pool)
results = ThreadPool(pool).imap_unordered(Domain_checker, Domains)
string_num = 0
for result in results:
print("{0} => {1}".format(string_num, result))
string_num += 1
with open("/tmp/__FOUND_DOMAINS__.txt", encoding='utf-8', errors = 'ignore') as found_domains:
found_domains = found_domains.read()
print("{0}\n{1}".format("#" * 40, found_domains))
requests.get("http://" + domain, headers=ua, verify=False, timeout=10)
The problem is resolved after installing timeout
Thank you to the user with the nickname "eri" (https://ru.stackoverflow.com/users/16574/eri) :)

python django, How to prohibit go to the next code from for loop

def redirectTest(item):
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}
r = None
try:
r = requests.head(item, allow_redirects=False, headers=headers)
except Exception as e:
print(e)
if r is not None:
if r.status_code == 301:
print("Tested: " + str(r.status_code))
elif r.status_code == 302:
print("Tested: " + str(r.status_code))
else:
print("Tested: " + str(r.status_code))
except requests.exceptions.RequestException as e:
print('error: ' + e)
return
#ensure_csrf_cookie
def re_check_url(request):
if request.method == "POST":
if request.is_ajax():
resolved_urls = ['twitch.tv/yumyumyu77']
scheme_list = ['http://www.', 'http://', 'https://www.', 'https://']
for item in resolved_urls:
for scheme_item in scheme_list:
redirectTest(scheme_item + item)
return JsonResponse({'res': 1})
return JsonResponse({'res': 2})
This code checks scheme + some url's responses.
But when I execute the code, my Django terminal prints:
r_status_code: 301
r_status_code: 301
r_status_code: 200
[22/Oct/2018 23:54:49] "POST /re/check/url/ HTTP/1.1" 200 10
r_status_code: 301
Problem:
I think it means that return JsonResponse({'res': 1}) this line is ahead, and print("Tested: " + str(r.status_code)) this line is after or later.
Sometimes it prints ordinarily, but sometimes it prints abnormally.
Question:
I learned that Python code is executed line by lines from top to bottom, but It seems like it is not doing so.
Why does this happened? and how can I fix it?
It's executing order is not what I expected.
Edit:
I tried to use Lock()
for item in resolved_urls:
for scheme_item in scheme_list:
from threading import Lock
_lock = Lock()
with _lock:
redirectTest(scheme_item + item)
But It does not seems to be working well.
Everything is actually working correctly. 200 is success, and it redirects to the success page. Then it runs your last loop.

the downloaded image's content-length is zero

I use python to download images from some website, some times the image's content-length is zero. The image can be accessed normally in web browser.
I have tried three methodes, and get the same result, so how to resolve this problem?
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 13:51:42 2017
"""
import urllib
import urllib2
import re
import uuid
import os
import requests
from lxml import etree
from multiprocessing import Pool
url = 'https://www.sina.com.cn/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
request = urllib2.Request(url)
request.add_header('User-Agent', user_agent)
response = urllib2.urlopen(request)
content = response.read()
tree=etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
node=tree.xpath("//img/#src")
dic1={}
dic2={}
localPath='E:\\pictures\\'
def generateFileName():
return str(uuid.uuid1())
def createFileWithFileName(localPathParam,fileName):
totalPath=localPathParam+'\\'+fileName
if not os.path.exists(totalPath):
file=open(totalPath,'wb')
file.close()
return totalPath
def worker(i):
path = node[i]
if not (dic1.has_key(path)):
dic1[path] = 1
index = path.rfind('/')
suffix = path[index+1:]
filename = suffix
#filename = generateFileName()+'.'+suffix
if(re.search(r'^(https?:)?\/\/', path)):
#print('save picture %s as %s' % (path,filename))
'''
#this code get the same result too
try:
urllib.urlretrieve(path, createFileWithFileName(localPath, filename))
except Exception, ex:
print(ex.message)
'''
with open(localPath + filename, 'wb') as handle:
response = requests.get(path, timeout=60)
if not response.ok:
print response
else:
print 'wrong when get ' + path
for block in response.iter_content(1024):
if not block:
break
handle.write(block)
'''
#this code get the same result too
try:
req = urllib2.Request(path)
req.add_header('User-Agent', user_agent)
picture = urllib2.urlopen(url=path, timeout=5).read()
document = open(localPath+filename,'wb')
document.write(picture)
document.close()
except Exception, ex:
print(ex.message)
'''
if __name__=='__main__':
p = Pool()
for i in range(len(node)):
p.apply_async(worker, args=(i,))
print 'Waiting for all subprocesses done...'
p.close()
p.join()
print 'All subprocesses done.'

python request urls parallel [duplicate]

This question already has an answer here:
How to send multiple http requests python
(1 answer)
Closed 6 years ago.
I created the following script to download images from an API endpoint which works as intended. Thing is that it is rather slow as all the requests have to wait on each other. What is the correct way to make it possible to still have the steps synchronously for each item I want to fetch, but make it parallel for each individual item. This from an online service called
servicem8
So what I hope to achieve is:
fetch all possible job ids => keep name/and other info
fetch name of the customer
fetch each attachment of a job
These three steps should be done for each job. So I could make things parallel for each job as they do not have to wait on each other.
Update:
Problem I do not understand is how can you make sure that you bundle for example the three calls per item in one call as its only per item that I can do things in parallel so for example when I want to
fetch item( fetch name => fetch description => fetch id)
so its the fetch item I want to make parallel?
The current code I have is working but rather slow:
import requests
import dateutil.parser
import shutil
import os
user = "test#test.com"
passw = "test"
print("Read json")
url = "https://api.servicem8.com/api_1.0/job.json"
r = requests.get(url, auth=(user, passw))
print("finished reading jobs.json file")
scheduled_jobs = []
if r.status_code == 200:
for item in r.json():
scheduled_date = item['job_is_scheduled_until_stamp']
try:
parsed_date = dateutil.parser.parse(scheduled_date)
if parsed_date.year == 2016:
if parsed_date.month == 10:
if parsed_date.day == 10:
url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item[
'company_uuid'])
c = requests.get(url_customer, auth=(user, passw))
cus_name = c.json()['name']
scheduled_jobs.append(
[item['uuid'], item['generated_job_id'], cus_name])
except ValueError:
pass
for job in scheduled_jobs:
print("fetch for job {}".format(job))
url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[
0])
r = requests.get(url, auth=(user, passw))
if r.json() == []:
pass
for attachment in r.json():
if attachment['active'] == 1 and attachment['file_type'] != '.pdf':
print("fetch for attachment {}".format(attachment))
url_staff = "https://api.servicem8.com/api_1.0/Staff.json?%24filter=uuid%20eq%20{}".format(
attachment['created_by_staff_uuid'])
s = requests.get(url_staff, auth=(user, passw))
for staff in s.json():
tech = "{}_{}".format(staff['first'], staff['last'])
url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment[
'uuid'])
r = requests.get(url, auth=(user, passw), stream=True)
if r.status_code == 200:
creation_date = dateutil.parser.parse(
attachment['timestamp']).strftime("%d.%m.%y")
if not os.path.exists(os.getcwd() + "/{}/{}".format(job[2], job[1])):
os.makedirs(os.getcwd() + "/{}/{}".format(job[2], job[1]))
path = os.getcwd() + "/{}/{}/SC -O {} {}{}".format(
job[2], job[1], creation_date, tech.upper(), attachment['file_type'])
print("writing file to path {}".format(path))
with open(path, 'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
else:
print(r.text)
Update [14/10]
I updated the code in the following way with some hints given. Thanks a lot for that. Only thing I could optimize I guess is the attachment downloading but it is working fine now. Funny thing I learned is that you cannot create a CON folder on a windows machine :-) did not know that.
I use pandas as well just to try to avoid some loops in my list of dicts but not sure if I am already most performant. Longest is actually reading in the full json files. I fully read them in as I could not find an API way of just telling the api, return me only the jobs from september 2016. The api query function seems to work on eq/lt/ht.
import requests
import dateutil.parser
import shutil
import os
import pandas as pd
user = ""
passw = ""
FOLDER = os.getcwd()
headers = {"Accept-Encoding": "gzip, deflate"}
import grequests
urls = [
'https://api.servicem8.com/api_1.0/job.json',
'https://api.servicem8.com/api_1.0/Attachment.json',
'https://api.servicem8.com/api_1.0/Staff.json',
'https://api.servicem8.com/api_1.0/Company.json'
]
#Create a set of unsent Requests:
print("Read json files")
rs = (grequests.get(u, auth=(user, passw), headers=headers) for u in urls)
#Send them all at the same time:
jobs,attachments,staffs,companies = grequests.map(rs)
#create dataframes
df_jobs = pd.DataFrame(jobs.json())
df_attachments = pd.DataFrame(attachments.json())
df_staffs = pd.DataFrame(staffs.json())
df_companies = pd.DataFrame(companies.json())
#url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
#c = requests.get(url_customer, auth=(user, passw))
#url = "https://api.servicem8.com/api_1.0/job.json"
#jobs = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading attachments json")
#url = "https://api.servicem8.com/api_1.0/Attachment.json"
#attachments = requests.get(url, auth=(user, passw), headers=headers)
#print("Reading staff.json")
#url_staff = "https://api.servicem8.com/api_1.0/Staff.json"
#staffs = requests.get(url_staff, auth=(user, passw))
scheduled_jobs = []
if jobs.status_code == 200:
print("finished reading json file")
for job in jobs.json():
scheduled_date = job['job_is_scheduled_until_stamp']
try:
parsed_date = dateutil.parser.parse(scheduled_date)
if parsed_date.year == 2016:
if parsed_date.month == 9:
cus_name = df_companies[df_companies.uuid == job['company_uuid']].iloc[0]['name'].upper()
cus_name = cus_name.replace('/', '')
scheduled_jobs.append([job['uuid'], job['generated_job_id'], cus_name])
except ValueError:
pass
print("{} jobs to fetch".format(len(scheduled_jobs)))
for job in scheduled_jobs:
print("fetch for job attachments {}".format(job))
#url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])
if attachments == []:
pass
for attachment in attachments.json():
if attachment['related_object_uuid'] == job[0]:
if attachment['active'] == 1 and attachment['file_type'] != '.pdf' and attachment['attachment_source'] != 'INVOICE_SIGNOFF':
for staff in staffs.json():
if staff['uuid'] == attachment['created_by_staff_uuid']:
tech = "{}_{}".format(
staff['first'].split()[-1].strip(), staff['last'])
creation_timestamp = dateutil.parser.parse(
attachment['timestamp'])
creation_date = creation_timestamp.strftime("%d.%m.%y")
creation_time = creation_timestamp.strftime("%H_%M_%S")
path = FOLDER + "/{}/{}/SC_-O_D{}_T{}_{}{}".format(
job[2], job[1], creation_date, creation_time, tech.upper(), attachment['file_type'])
# fetch attachment
if not os.path.isfile(path):
url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment[
'uuid'])
r = requests.get(url, auth=(user, passw), stream = True)
if r.status_code == 200:
if not os.path.exists(FOLDER + "/{}/{}".format(job[2], job[1])):
os.makedirs(
FOLDER + "/{}/{}".format(job[2], job[1]))
print("writing file to path {}".format(path))
with open(path, 'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
else:
print("file already exists")
else:
print(r.text)
General idea is to use asynchronous url requests and there is a python module named grequests for that-https://github.com/kennethreitz/grequests
From Documentation:
import grequests
urls = [
'http://www.heroku.com',
'http://python-tablib.org',
'http://httpbin.org',
'http://python-requests.org',
'http://fakedomain/',
'http://kennethreitz.com'
]
#Create a set of unsent Requests:
rs = (grequests.get(u) for u in urls)
#Send them all at the same time:
grequests.map(rs)
And the resopnse
[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, None, <Response [200]>]

Categories

Resources