Check whether people are online or offline in WhatsApp - Python Selenium - python

I am trying to create a Python-Selenium project which checks whether the people in my WhatsApp chat list are online or offline. Basically it brute-forces one by one to check whether someone is online or not and then saves the data in an Excel file. It also gives a green background to the people who are online.
here is my code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from openpyxl import Workbook
from openpyxl.styles import PatternFill
import time

workbook = Workbook()
sheet = workbook.active

browser = webdriver.Chrome(executable_path=r"F:\software\chromedriver_win32\chromedriver.exe")
browser.get('https://web.whatsapp.com/')

print("Loading..\n")
for x in range(5, 0, -1):
    print(x)
    time.sleep(1)

# The function below checks whether the 'online' element exists or not.
# I got the class name by inspecting the WhatsApp Web page.
def check_exists_by_xpath():
    try:
        browser.find_element_by_xpath('//span[@class="O90ur _3FXB1"]')
    except NoSuchElementException:
        return False
    return True

count = 1
# The XPath gets the names of the people on my chat list.
for iterator in browser.find_elements_by_xpath('//div[@class="_2wP_Y"]'):
    iterator.click()
    cellA = "A" + str(count)
    cellB = "B" + str(count)
    time.sleep(2)
    name = browser.find_element_by_xpath('//div[@class="_3XrHh"]/span').text
    if check_exists_by_xpath():
        sheet[cellA] = name
        sheet[cellB] = " isOnline\n"
        sheet[cellA].fill = PatternFill(start_color="a4d968", end_color="a4d968", fill_type="solid")
        sheet[cellB].fill = PatternFill(start_color="a4d968", end_color="a4d968", fill_type="solid")
    else:
        sheet[cellA] = name
        sheet[cellB] = " isOffline\n"
    count = count + 1

workbook.save(filename="WhatsApp_Data.xlsx")
print("Complete..!")
browser.close()
But I can't understand why the code stops after collecting data for 18 people. Also, can anyone suggest a better technique to achieve this, other than brute-forcing?
The code just clicks on each name in the WhatsApp Web chat list, and if the element which displays the online status (beneath the name) exists, the check returns True, otherwise False.
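The cutoff at roughly 18 chats is most likely because WhatsApp Web renders the chat list as a virtualized list: only the rows currently scrolled into view exist in the DOM, so find_elements_by_xpath returns only those. Below is a rough sketch, not a tested solution, that scrolls the side pane in steps and collects names as new rows appear; the pane id 'pane-side' and the class names from the question are assumptions and may have changed in newer versions of WhatsApp Web.

# A minimal sketch, assuming WhatsApp Web's virtualized chat list and the
# class names from the question ("_2wP_Y" for a chat row, id "pane-side"
# for the scrollable pane). Verify these in the current page source.
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://web.whatsapp.com/')
time.sleep(15)  # time to scan the QR code

seen_names = set()
pane = browser.find_element_by_id('pane-side')  # assumed id of the scrollable chat pane

for _ in range(10):  # scroll in steps instead of relying on one find_elements call
    rows = browser.find_elements_by_xpath('//div[@class="_2wP_Y"]')
    for row in rows:
        name = row.text.split('\n')[0]  # first line of a row is usually the contact name
        if name and name not in seen_names:
            seen_names.add(name)
            # click the row and run the online check here
    browser.execute_script('arguments[0].scrollTop += arguments[0].clientHeight;', pane)
    time.sleep(1)

print(len(seen_names), 'chats collected')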

Related

How to download dynamically loaded images using python and seleniumwire?

First of all, I should mention that I have very little programming experience, and I am having some trouble with the logic and flow of a general web scraper implemented in Python. I assume that I should use callbacks or similar methods to properly control the process of saving pages from a JavaScript e-book reader. My script does work, but not consistently. If someone could advise me on improvements to this script, that would be great. Thank you.
from seleniumwire import webdriver  # selenium-wire's webdriver is needed for driver.requests
from seleniumwire.utils import decode as sdecode
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options # [!]
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import os.path

opts = Options() # [!]
opts.add_experimental_option('w3c', True) # [!]
capabilities = DesiredCapabilities.CHROME.copy()
driver = webdriver.Chrome(chrome_options=opts, desired_capabilities=capabilities)

url = ''  # here comes the url
driver.get(url)

directory_path = ''  # folder where the images should be saved
file_name = ''       # base name for the saved images
endmark = '&scale=2&rotate=0'  # length must be 17

def get_requests():
    l = []
    for rx in driver.requests:
        if rx.url[-17:] == endmark:
            l.append(rx.url)
    return list(set(l))

def savepages(diff):
    newpages = 0
    for urlitem in diff:
        for request in driver.requests:
            if request.url == urlitem:
                #print(request.url)
                ind = urlitem.find('.jp2&id')  # ex. 0012.jp2&id
                file_path = directory_path + '\\' + file_name + urlitem[ind-4:ind] + '.jpg'
                tik = 0
                while tik < 10:  # waiting for the response body data
                    try:
                        tik += 1
                        data = sdecode(request.response.body, request.response.headers.get('Content-Encoding', 'identity'))
                    except AttributeError:  # no data yet
                        time.sleep(2)  # wait 2 sec for the data
                        continue
                #data = data.decode("utf-8",'ignore')
                # sometimes I get this error 'UnboundLocalError: local variable 'data' referenced before assignment'
                # I assumed that the following condition would help but it doesn't seem to work consistently
                if data:
                    with open(file_path, 'wb') as outfile:
                        outfile.write(data)  # sometimes I get UnboundLocalError
                else:
                    print('no data')
                # was the file saved or not
                if os.path.exists(file_path):
                    newpages += 1  # something is wrong with the counting logic, since pages+newpages should be equal to the length of li=get_requests(), I get more
                else:
                    time.sleep(.5)
    return newpages

count = 0   # a counter, should terminate the main delay loop
pages = 0   # counting all saved pages; book pages and images are equivalent, one turn should open 2 new pages/images/requests
oldli = []  # compare to the new list after each delay cycle
turns = 0   # count how many turns have been made, i.e. how many times we clicked the Next Page button
li = get_requests()  # get all unique requests for the images/pages; some requests might still be loading, but we manually opened the first page and visually confirmed that there are at least 1 or 3 images/requests

if li:  # the program STARTS HERE; first try, there are some requests because we manually opened the first page
    # THE MAIN CYCLE should stop when the delay is too long and we have turned all the pages of the book
    while 2*turns + 1 < len(li) or count < 15:  # should terminate the whole program when there are no more images coming
        count = 0        # reset counter
        success = False  # reset success; new pages downloaded successfully
        # the main delay counter
        # what happens if diff is [] and no success
        while True:
            count += 1
            if count > 14:
                print('Time out after more than 10 seconds.')
                break
            li = get_requests()  # in addition, I assume that all requests counting from page 1 will be kept
            # it is possible that li will not have some of the old requests and oldli will be longer
            # well, I need to keep all old requests in a separate list and then append to it
            diff = list(set(li) - set(oldli))  # find new requests after the delay
            if diff:  # there are some new
                npages = savepages(diff)  # saves new images and returns the number of them
                print('newpages ', npages, ' len diff ', len(diff))  # should be equal
                if npages >= len(diff) - 1:  # we allow one request without a body with data ??
                    pages += npages  # something is not ok here, the number of pages sometimes exceeds the length of li
                    success = True   # we call it a success
                else:
                    print('Could not save pages. Newpages ', npages, ' len diff ', len(diff))
                    for pg in diff:
                        print(pg)  # for debugging purposes
                    break  # in this case we break from the delay cycle
            else:
                time.sleep(2)  # if no new requests, add 2 sec to the waiting time
            if success:  # we turn pages in case of a successful download; this is bad if we need to catch up
                while 2*turns + 1 < len(li):  # if some of the old requests are deleted then the program will stop earlier
                    # it won't wait for the bodies of the requests, there is a problem
                    driver.find_elements(By.CLASS_NAME, "BRicon.book_right.book_flip_next")[0].click()
                    turns += 1
                    time.sleep(3)  # I got the impression that this doesn't happen
                oldli = li
                print('pages ', pages, ' length of list ', len(li))
                break  # we break from the delay cycle since success
            time.sleep(2)  # the main delay timer; plus the no-diff timer = total time
else:
    print('no requests in the list to process')
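Most of the retry and UnboundLocalError trouble comes from decoding a response body that has not arrived yet. Instead of hand-rolled sleep loops, selenium-wire (per its documentation) offers driver.scopes to limit which requests are captured and driver.wait_for_request() to block until a matching request appears; combined with a short poll on request.response before decoding, the saving logic becomes much simpler. A hedged sketch, assuming the '&scale=2&rotate=0' suffix from the question and an unspecified reader URL:

# A sketch under stated assumptions, not a drop-in replacement.
import time
from seleniumwire import webdriver
from seleniumwire.utils import decode as sdecode

driver = webdriver.Chrome()
driver.scopes = [r'.*scale=2&rotate=0.*']   # only capture the page-image requests
driver.get('')  # the e-book reader URL goes here

def save_image(request, file_path, attempts=10):
    """Poll until the response body is available, then decode and save it."""
    for _ in range(attempts):
        if request.response is not None and request.response.body:
            data = sdecode(request.response.body,
                           request.response.headers.get('Content-Encoding', 'identity'))
            with open(file_path, 'wb') as outfile:
                outfile.write(data)
            return True
        time.sleep(1)
    return False

# wait (up to 30 s) until at least one matching image request has been captured
first = driver.wait_for_request('scale=2&rotate=0', timeout=30)
save_image(first, 'page_0001.jpg')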

Only the first entry of each page is getting scraped and printed, but I need all the data

I'm having an issue where my scraping code only prints out the first entry of each page. What I need is for all the data from all three pages of the website to be scraped and added to the list 'infoList'.
I assume the problem is the declaration 'CAR_INFO = 0', but I'm not sure how to fix it. Any tips or a push in the right direction would be greatly appreciated.
my code:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import re

DRIVER_PATH = r"C:\Users\salmou\Downloads\chromedriver_win32\chromedriver.exe"
URL = "https://vancouver.craigslist.org/"
browser = webdriver.Chrome(DRIVER_PATH)
browser.get(URL)
time.sleep(4)

SEARCH_TERM = "Honda"
search = browser.find_element_by_css_selector("#query")
search.send_keys(SEARCH_TERM)
search.send_keys(u'\ue007')  # Enter key

class ScrapedData:
    carInfo = ""

    def __init__(self, carInfo):
        self.carInfo = carInfo

    def scrapedCarInfo(self):
        print(SEARCH_TERM + " information: " + self.carInfo)
        print("****")

infoList = []
for i in range(0, 3):
    content = browser.find_elements_by_css_selector(".hdrlnk")
    for e in content:
        start = e.get_attribute("innerHTML")
        soup = BeautifulSoup(start, features=("lxml"))
        rawString = soup.get_text().strip()
        # print(soup.get_text())
        # print("*****************************************************")
    button = browser.find_element_by_css_selector(".next")
    button.click()
    time.sleep(3)
    rawString = re.sub(r"[\n\t]*", "", rawString)
    # Replace two or more consecutive spaces with '*'
    rawString = re.sub('[ ]{2,}', '*', rawString)
    infoArray = rawString.split('*')
    CAR_INFO = 0
    carInfo = infoArray[CAR_INFO]
    objInfo = ScrapedData(carInfo)
    infoList.append(objInfo)

for info in infoList:
    info.scrapedCarInfo()
I see you have two loops: an outer one over i and an inner one over e, but I can't see any reference to the current i value inside the loop, so it looks like you are performing the same action three times.
Also, rawString is defined and assigned in the inner loop but only processed in the outer loop, so only the last value rawString received in the inner loop is ever processed. This is likely the cause of your problem.
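A hedged sketch of the fix this answer describes: move the string processing and the append inside the inner loop so every .hdrlnk element is handled, and click 'next' only after the inner loop finishes. It reuses the browser session, imports (BeautifulSoup, re, time) and the ScrapedData class from the question.

infoList = []
for i in range(0, 3):                       # three result pages
    for e in browser.find_elements_by_css_selector(".hdrlnk"):
        raw = BeautifulSoup(e.get_attribute("innerHTML"), features="lxml").get_text().strip()
        raw = re.sub(r"[\n\t]*", "", raw)
        raw = re.sub("[ ]{2,}", "*", raw)   # collapse runs of spaces into '*'
        infoList.append(ScrapedData(raw.split("*")[0]))    # process every entry, not just the last one
    browser.find_element_by_css_selector(".next").click()  # turn the page after the inner loop
    time.sleep(3)

for info in infoList:
    info.scrapedCarInfo()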

Can I pause a scroll function in selenium, scrape the current data, and then continue scrolling later in the script?

I am a student working on a scraping project and I am having trouble completing my script because it fills my computer's memory with all of the data it stores.
It currently stores all of my data until the end, so my solution to this would be to break up the scrape into smaller bits and then write out the data periodically so it does not just continue to make one big list and then write out at the end.
In order to do this, I would need to stop my scroll method, scrape the loaded profiles, write out the data that I have collected, and then repeat this process without duplicating my data. It would be appreciated if someone could show me how to do this. Thank you for your help :)
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException

Data = []

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")

count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)

    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"

    info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
    Data.extend(info)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

with open("Spredsheet.txt", "w") as output:
    output.write(','.join(Data))
driver.close()
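A hedged sketch of the batching idea described in the question: scroll until new profile links are loaded, scrape only the links not yet seen, append that batch to the output file, and repeat. It reuses the //div[@align='right']/a XPath from the question; scrape_profile() is a hypothetical placeholder for the per-profile XPath code above, and the batch size and stall limit are arbitrary.

import csv
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")

BATCH_SIZE = 50
total_wanted = int(input("Number of profiles to scrape: "))
seen = set()  # hrefs already written out, prevents duplicates
body = driver.find_element_by_xpath("//body")

def scrape_profile(url):
    """Placeholder for the per-profile XPath lookups from the question."""
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    driver.get(url)
    row = [driver.title]  # replace with the Name / Expires / ... lookups above
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    return row

with open("Spreadsheet.csv", "w", newline="") as f:
    writer = csv.writer(f)
    stalls = 0
    while len(seen) < total_wanted and stalls < 5:
        links = [a.get_attribute("href")
                 for a in driver.find_elements_by_xpath("//div[@align='right']/a")]
        new = [u for u in links if u not in seen]
        if not new:  # nothing new loaded yet, scroll and try again
            body.send_keys(Keys.END)
            sleep(1)
            stalls += 1
            continue
        stalls = 0
        for url in new[:BATCH_SIZE]:  # scrape this batch and flush it to disk
            writer.writerow(scrape_profile(url))
            seen.add(url)
        f.flush()  # the data leaves memory after every batch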
Try the approach below using requests and BeautifulSoup. In the script I have used the API URL fetched from the website itself.
On the first iteration it creates the URL, then adds the headers and data to the .csv file.
On the second and later iterations it creates the URL again with two extra parameters, start_on_page and show_per_page, where start_on_page starts at 20 and is incremented by 20 on each iteration and show_per_page is set to 100 to extract 100 records per iteration, and so on until all the data has been dumped into the .csv file.
The script dumps four fields: number, name, location and profile URL.
On each iteration the data is appended to the .csv file, so your memory issue will be resolved by this approach.
Before running the script, do not forget to set the file_path variable to the directory where you want the .csv file to be created.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv

def scrap_directory_data():
    list_of_credentials = []
    file_path = ''
    file_name = 'credential_list.csv'
    count = 0
    page_number = 0
    page_size = 100
    create_url = ''
    main_url = 'https://directory.bcsp.org/search_results.php?'
    first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
    number_of_records = 0
    csv_headers = ['#', 'Name', 'Location', 'Profile URL']

    while True:
        if count == 0:
            create_url = main_url + first_iteration_url
            print('-' * 100)
            print('1 iteration URL created: ' + create_url)
            print('-' * 100)
        else:
            create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
            print('-' * 100)
            print('Other than first iteration URL created: ' + create_url)
            print('-' * 100)
        page = requests.get(create_url, verify=False)
        extracted_text = bs(page.text, 'lxml')
        result = extracted_text.find_all('tr')
        if len(result) > 0:
            for idx, data in enumerate(result):
                if idx > 0:
                    number_of_records += 1
                    name = data.contents[1].text
                    location = data.contents[3].text
                    profile_url = data.contents[5].contents[0].attrs['href']
                    list_of_credentials.append({
                        '#': number_of_records,
                        'Name': name,
                        'Location': location,
                        'Profile URL': profile_url
                    })
                print(data)
                with open(file_path + file_name, 'a+') as cred_CSV:
                    csvwriter = csv.DictWriter(cred_CSV, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                    if idx == 0 and count == 0:
                        print('Writing CSV header now...')
                        csvwriter.writeheader()
                    else:
                        for item in list_of_credentials:
                            print('Writing data rows now..')
                            print(item)
                            csvwriter.writerow(item)
                        list_of_credentials = []
        else:
            break  # no more rows returned, stop paginating
        count += 1
        page_number += 20

scrap_directory_data()

Looping send_keys() function causing 'NoneType' object error

I have a tuple (names) that has two lists in it: list1 contains last names, list2 contains first names. I'm trying to loop through each list and send the first and last name to two textarea fields on a website. The first time the loop runs everything works perfectly, but the second time I get the error: Exception has occurred: TypeError: 'NoneType' object is not subscriptable. I'm assuming send_keys() is returning None and is somehow screwing things up.
Here is my code for generating the lists and tuple. The info is being read from an Excel sheet.
def get_names(book):
    f_names = []
    l_names = []
    workbook = xlrd.open_workbook(book)
    worksheet = workbook.sheet_by_index(0)
    for y in range(1, worksheet.nrows):
        l_names.append(worksheet.cell_value(y, 0))
        f_names.append(worksheet.cell_value(y, 1))
    return (l_names, f_names)
In my main program I have the following loop running:
import xlrd
from selenium import webdriver

url = 'https://website.com'
driver = webdriver.Chrome('/usr/local/bin/chromedriver')
driver.get(url)

for i in range(len(names[0])):
    lName = driver.find_element_by_id('id_last_name')
    lName.send_keys(names[0][i])
    fName = driver.find_element_by_id('id_first_name')
    fName.send_keys(names[1][I])
    search_button = driver.find_element_by_xpath('//*[@id="table-responsive"]/table/tbody/tr[1]/td/div/table[2]/tbody/tr/td/table/tbody/tr[5]/td/table/tbody/tr[4]/td/table/tbody/tr[2]/td[2]/input').click()
Once send_keys() enters the name in the field, the program is supposed to start over and enter the next name in the list.
Can someone tell me how to fix this, or suggest a better way of doing it?
Try initializing the variables first:
for i in range(len(names[0])):
    lName, fName = "", ""
    lName = driver.find_element_by_id('id_last_name')
    lName.send_keys(names[0][i])
    fName = driver.find_element_by_id('id_first_name')
    fName.send_keys(names[1][i])
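A hedged alternative that avoids indexing altogether (and with it the stray capital I in names[1][I]): zip the two lists returned by get_names() and iterate over the pairs. The element IDs come from the question; the workbook name is a placeholder.

import xlrd
from selenium import webdriver

driver = webdriver.Chrome('/usr/local/bin/chromedriver')
driver.get('https://website.com')

last_names, first_names = get_names('names.xlsx')  # placeholder workbook name

for last, first in zip(last_names, first_names):   # no manual indexing needed
    driver.find_element_by_id('id_last_name').send_keys(last)
    driver.find_element_by_id('id_first_name').send_keys(first)
    # click the search button, then wait for / navigate back to the form
    # before the next iteration so the fields can be located again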

Webdriver/BeautifulSoup Getting my program to check if part of a string exists on the webpage

I am writing a bot that purchases items automatically. The way I am currently going about this is by putting the product info in a dictionary titled INFO and referencing it whenever I need a specific product/color/etc.
Currently my code (specifically in findProduct()) checks whether an entry in temp_tuple is exactly the same as INFO['product'], for instance.
In my case, I look for a product and my code returns an error because there is a space at the end of some of the names, and my code cannot handle that.
I want to modify it to check whether the string appears on the webpage, so that my code still runs even with that extra space.
Here is enough of my code that works as it is:
#!/usr/bin/env python3
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
import time
import requests
import bs4 as bs
from splinter import Browser
import helpers
from selenium.common.exceptions import ElementNotInteractableException
from config import INFO

class supremeBot(object):
    def __init__(self, **info):
        self.base_url = 'http://www.supremenewyork.com/'
        self.shop = 'shop/all/'
        self.checkout = 'checkout/'
        self.info = info

    def initializeBrowser(self):
        driverz = self.info["driver"]
        path = helpers.get_driver_path(driver)
        if driverz == "geckodriver":
            self.b = Browser()
        elif driverz == "chromedriver":
            executable_path = {"executable_path": path}
            self.b = Browser('chrome', **executable_path)

    # This looks for the product based on what the category is
    def findProduct(self):
        category = str(INFO['category'])
        source = requests.get("http://www.supremenewyork.com/shop/all/" + category).text
        soup = bs.BeautifulSoup(source, 'lxml')
        temp_link = []
        temp_tuple = []
        for link in soup.find_all('a', href=True):
            temp_tuple.append((link['href'], link.text))
        for i in temp_tuple:
            if i[1] == INFO['product'] or i[1] == INFO['color']:  # <------------ I want this to recognize a partial string
                temp_link.append(i[0])
        #print(temp_link)
        # This creates the end of the final link
        self.final_link = list(
            set([x for x in temp_link if temp_link.count(x) == 2]))[0]
        # Concatenates the previous link w/ the website
        link = 'http://www.supremenewyork.com' + str(self.final_link)
        driver.get(link)

if __name__ == "__main__":
    driver = webdriver.Chrome('./chromedriver')
    '''
    BOT = supremeBot(**INFO)
    BOT.findProduct()
    order()
    '''
    BOT = supremeBot(**INFO)

    found_product = False
    counter = 1
    max_iter = 5
    while not found_product and counter < max_iter:
        found_product = BOT.findProduct()
        print("We tried ", counter, " times.")
        counter += 1
        if found_product:
            print('Couldn\'t find it')
            continue
        else:
            print('found it')
            order()

INFO = {
    "driver": "chromedriver",
    "product": "Supreme®/MLB New Era®",  # "Big Duffle Bag " is an example of a product that has the space after it
    "color": "Navy",
    "category": "hats",
    "size": "Medium",
    "namefield": "Bucky McNuts",
    "emailfield": "email@email.com",
    "phonefield": "(555)555-5555",
    "addressfield": "321 St",
}
In this case, if you replace Supreme®/MLB New Era® with "Big Duffle Bag ", you'll see the code doesn't run if you remove the space after the word bag.
If anybody could help, I would really appreciate it!
You can do this check for a partial string:
if "part" in "partstring":
    print("the word 'part' is within 'partstring'")
Possible use here:
if INFO['product'] in i[1].lower() or INFO['color'] in i[1].lower():
    # do something
The .lower() is there to make sure the text from the site is compared in lower case.
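Since the immediate problem is a trailing space in the product name, a hedged variant is to normalize both sides before comparing: strip whitespace and lower-case both the link text and the INFO values (the snippet above lower-cases only one side). This reuses INFO, temp_tuple and temp_link from findProduct() in the question.

product = INFO['product'].strip().lower()
color = INFO['color'].strip().lower()

for href, text in temp_tuple:
    label = text.strip().lower()          # "Big Duffle Bag " -> "big duffle bag"
    if product in label or color in label:
        temp_link.append(href)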
