Crawling the Google Play app store with Python — error

I am using the code below to crawl app reviews.
The first few pages come out well,
but then it raises an error.
I would appreciate your advice.
What can I do to fix it?
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup, Comment
import pandas as pd

# NOTE(review): PhantomJS support has been removed from modern Selenium;
# prefer headless Chrome/Firefox. Kept here to match the original setup.
driver = webdriver.PhantomJS("C:/Python/phantomjs-2.1.1-windows/bin/phantomjs.exe")
link = "https://play.google.com/store/apps/details?id=com.supercell.clashofclans&hl=en"
driver.get(link)

# App title, stripped of spaces, is reused as the CSV file-name prefix.
Ptitle = driver.find_element_by_class_name('id-app-title').text.replace(' ', '')
print(Ptitle)

sleep(1)
# XPath attribute tests use '@', not '#' ('#id' was a paste artifact and
# matches nothing, which is one source of the "element not visible" error).
driver.find_element_by_xpath(
    '//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div'
).click()
sleep(2)
# Open the review-sort dropdown and pick "Newest" via JS, because the
# dropdown entries are hidden until the menu opens (ElementNotVisible).
driver.find_element_by_css_selector('.displayed-child').click()
driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")

reviews_df = []
for i in range(1, 10000):
    try:
        for elem in driver.find_elements_by_class_name('single-review'):
            print(str(i))
            content = elem.get_attribute('outerHTML')
            soup = BeautifulSoup(content, "html.parser")
            date = soup.find('span', class_='review-date').get_text()
            # aria-label looks like "Rated 4 stars out of five"; chars 6:7 = digit.
            rating = soup.find('div', class_='tiny-star')['aria-label'][6:7]
            title = soup.find('span', class_='review-title').get_text()
            # The body repeats the title, so slice it (and the separator) off.
            txt = soup.find('div', class_='review-body').get_text().replace('Full Review', '')[len(title) + 1:]
            print(soup.get_text())
            # The 'Review Title' key was split across two lines in the paste,
            # which is a SyntaxError inside a string literal.
            temp = pd.DataFrame(
                {'Date': date, 'Rating': rating, 'Review Title': title, 'Review Text': txt},
                index=[0],
            )
            print('-' * 10)
            reviews_df.append(temp)
    except Exception:
        # On any failure, try clicking the "next page" button and continue.
        print('s')
        driver.find_element_by_xpath(
            '//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div'
        ).click()

reviews_df = pd.concat(reviews_df, ignore_index=True)
reviews_df.to_csv(Ptitle + 'review_google.csv', encoding='utf-8')
driver.close()  # release the browser process (was commented out, leaking it)
This error occurred during the crawl, but I don't understand it.
The operating system is Windows; I'm running the analysis in Python and using PhantomJS.
*Google Play store crawling
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.ElementNotVisibleException:
Message: {
"errorMessage": "Element is not currently visible and may not be manipulated",
"request":{
"headers":{
"Accept":"application/json",
"Accept-Encoding":"identity",
"Connection":"close",
"Content-Length":"81",
"Content-Type":"application/json; charset=UTF-8",
"Host":"127.0.0.1:58041",
"User-Agent":"Python http auth"
},
"httpVersion":"1.1",
"method":"POST",
"post":"{
\"id\": \":wdc:1505360987512\",
\"sessionId\": \"b7c59070-98ff-11e7-8363-fdfc8cdfd230 \"
}",
"url":"/click",
"urlParsed": {
"anchor":"",
"query":"",
"file":"click",
"directory":"/",
"path":" /click",
"relative":" /click",
"port":"",
"host":"",
"password":"",
"user":"",
"userInfo":"",
"authority":" ",
"protocol":"",
"source":"/click",
"queryKey":{},
"chunks": ["click"]
},
"urlOriginal":"/session/b7c59070-98ff-11e7-8363-fdfc8cdfd230/element /:wdc:1505360987512/click"
}
}
Screenshot: available via screen

Related

Insert value in searchbar, select autocomplete result and get value by bs4

I am trying to use Beautiful Soup to read a value from a web page. The following steps are necessary:
go to the webpage:
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'
insert the ISIN in the searchbar
3. select the autocomplete-results from the container msci-ac-search-data-dropdown (click)
4. read the value from the "div class: ratingdata-outercircle esgratings-profile-header-green" to get the text: "ratingdata-fund-rating esg-fund-ratings-circle-aaa".
so far i have tried the following:
import requests
from bs4 import BeautifulSoup

isin = 'IE00B4L5Y983'
url = 'https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/'

soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Collect the hidden input fields the search form expects, then add the ISIN.
# (The loop body had lost its indentation in the paste -- SyntaxError.)
payload = {}
for i in soup.select('form[action="https://www.msci.com/search"] input[value]'):
    payload[i['name']] = i['value']
payload['UQ_txt'] = isin
Try:
import requests
from bs4 import BeautifulSoup

isin = "IE00B4L5Y983"
url = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "X-Requested-With": "XMLHttpRequest",
}

# Step 1: hit the search resource with the ISIN to resolve the fund's id.
search_params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "searchFundRatingsProfiles",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_keywords": isin,
}
data = requests.get(url, params=search_params, headers=headers).json()

# Step 2: fetch the profile fragment for the first match, sending the
# Referer the backend expects for this resource.
profile_params = {
    "p_p_id": "esg_fund_ratings_profile",
    "p_p_lifecycle": "2",
    "p_p_state": "normal",
    "p_p_mode": "view",
    "p_p_resource_id": "showEsgFundRatingsProfile",
    "p_p_cacheability": "cacheLevelPage",
    "_esg_fund_ratings_profile_fundShareClassId": data[0]["url"],
}
headers["Referer"] = "https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/{}/{}".format(
    data[0]["encodedTitle"], data[0]["url"]
)
profile_html = requests.get(url, params=profile_params, headers=headers).content
soup = BeautifulSoup(profile_html, "html.parser")

# The rating itself is encoded in the element's class list.
data = soup.select_one(".ratingdata-fund-rating")["class"]
print(data)
Prints:
['ratingdata-fund-rating', 'esg-fund-ratings-circle-aaa']
When you press enter, you send another request, which already shows the search result. Here is an example of how to get what you want
import requests

isin = 'IE00B4L5Y983'
# The search endpoint returns a JSON list; each entry carries the fund title.
url = f"https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings?p_p_id=esg_fund_ratings_profile&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchFundRatingsProfiles&p_p_cacheability=cacheLevelPage&_esg_fund_ratings_profile_keywords={isin}"
# The loop body had lost its indentation in the paste -- SyntaxError.
for title in requests.get(url).json():
    print(title['title'])
OUTPUT:
iShares Core MSCI World UCITS ETF USD (Acc)
If I may: from the OP's description I can only infer this is either an education related test, either a job interview related test. As such, following the exact instructions is paramount. In order to follow said instructions, you can only use selenium. The following code will work 'a la point', and get the desired result:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Headless Chrome setup; point Service at your local chromedriver binary.
opts = Options()
opts.add_argument("--no-sandbox")
opts.add_argument("--headless")
service = Service("chromedriver/chromedriver")  ## path to where you saved chromedriver binary
driver = webdriver.Chrome(service=service, options=opts)

driver.get('https://www.msci.com/our-solutions/esg-investing/esg-fund-ratings/funds/')
wait = WebDriverWait(driver, 20)

# Type the ISIN into the search bar once it is present in the DOM.
search_box = wait.until(EC.presence_of_element_located((By.ID, '_esg_fund_ratings_profile_keywords')))
search_box.send_keys('IE00B4L5Y983')

# Click the first autocomplete suggestion once it becomes visible.
wait.until(EC.visibility_of_element_located((By.ID, 'ui-id-1')))
driver.find_element(By.ID, "ui-id-1").click()

# Read the rating classes from the green profile header's inner div.
wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'esgratings-profile-header-green')))
header = driver.find_element(By.CLASS_NAME, "esgratings-profile-header-green")
rating_class = header.find_element(By.TAG_NAME, "div").get_attribute('class')
print(rating_class)

driver.quit()
This will return:
ratingdata-fund-rating esg-fund-ratings-circle-aaa

Scraping multiple select options using Selenium

I am required to scrape PDF's from the website https://secc.gov.in/lgdStateList. There are 3 drop-down menus for a state, a district and a block.
There are several states, under each state we have districts and under each district there are blocks.
I tried to implement the following code. I was able to select the state, but there seems to be some error when I select the district.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
from selenium import webdriver
from bs4 import BeautifulSoup

# First pass: dump the dropdown rows so the option values can be inspected.
browser = webdriver.Chrome()
url = ("https://secc.gov.in/lgdStateList")
browser.get(url)
html_source = browser.page_source
browser.quit()

soup = BeautifulSoup(html_source, 'html.parser')
for name_list in soup.find_all(class_='dropdown-row'):
    print(name_list.text)

# Second pass: walk state -> district -> block. The inner loops must be
# NESTED (the paste had flattened them), and the district/block <select>s
# are filled in by an AJAX call only AFTER the parent option is clicked --
# looking them up immediately raises NoSuchElementException.
driver = webdriver.Chrome()
driver.get('https://secc.gov.in/lgdStateList')

selectState = Select(driver.find_element_by_id("lgdState"))
for state in selectState.options:
    state.click()
    time.sleep(2)  # wait for the district dropdown to be populated
    selectDistrict = Select(driver.find_element_by_id("lgdDistrict"))
    for district in selectDistrict.options:
        district.click()
        time.sleep(2)  # wait for the block dropdown to be populated
        selectBlock = Select(driver.find_element_by_id("lgdBlock"))
        # `options` is a property, not a method -- calling it raised TypeError.
        for block in selectBlock.options:
            block.click()
The error I ran into is :
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="lgdDistrict"]"}
(Session info: chrome=83.0.4103.106)
I need help crawling through the 3 menus.
Any help/suggestions would be really appreciated. Let me know of any clarifications in the comments.
This is where you can find the value of different states. You can find the same from district and block dropdowns.
You should now use those values within payload to get the table you would like to grab data from:
import urllib3
import requests
from bs4 import BeautifulSoup

# The site serves an invalid certificate, so verification is disabled below;
# silence the resulting InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

link = "https://secc.gov.in/lgdGpList"
payload = {
    'stateCode': '10',
    'districtCode': '188',
    'blockCode': '1624'
}

r = requests.post(link, data=payload, verify=False)
soup = BeautifulSoup(r.text, "html.parser")
# Print each table row as a list of whitespace-normalised cell texts.
# (The loop body had lost its indentation in the paste -- SyntaxError.)
for items in soup.select("table#example tr"):
    data = [' '.join(item.text.split()) for item in items.select("th,td")]
    print(data)
Output the script produces:
['Select State', 'Select District', 'Select Block']
['', 'Select District', 'Select Block']
['ARARIA BASTI (93638)', 'BANGAMA (93639)', 'BANSBARI (93640)']
['BASANTPUR (93641)', 'BATURBARI (93642)', 'BELWA (93643)']
['BOCHI (93644)', 'CHANDRADEI (93645)', 'CHATAR (93646)']
['CHIKANI (93647)', 'DIYARI (93648)', 'GAINRHA (93649)']
['GAIYARI (93650)', 'HARIA (93651)', 'HAYATPUR (93652)']
['JAMUA (93653)', 'JHAMTA (93654)', 'KAMALDAHA (93655)']
['KISMAT KHAWASPUR (93656)', 'KUSIYAR GAWON (93657)', 'MADANPUR EAST (93658)']
['MADANPUR WEST (93659)', 'PAIKTOLA (93660)', 'POKHARIA (93661)']
['RAMPUR KODARKATTI (93662)', 'RAMPUR MOHANPUR EAST (93663)', 'RAMPUR MOHANPUR WEST (93664)']
['SAHASMAL (93665)', 'SHARANPUR (93666)', 'TARAUNA BHOJPUR (93667)']
You need to scrape the numbers available in brackets adjacent to each results above and then use them in payload and send another post requests to download the pdf files. Make sure to put the script in a folder before execution so that you can get all the files within.
import urllib3
import requests
from bs4 import BeautifulSoup

# Certificate verification is disabled (invalid cert); silence the warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

link = "https://secc.gov.in/lgdGpList"
download_link = "https://secc.gov.in/downloadLgdwisePdfFile"
payload = {
    'stateCode': '10',
    'districtCode': '188',
    'blockCode': '1624'
}

r = requests.post(link, data=payload, verify=False)
soup = BeautifulSoup(r.text, "html.parser")

# Each anchor's text ends with the GP code in brackets, e.g. "BANGAMA (93639)";
# extract that code and POST it back to fetch the PDF.
# (The loop and `with` bodies had lost their indentation in the paste.)
for item in soup.select("table#example td > a[onclick^='downloadLgdFile']"):
    gp_code = item.text.strip().split("(")[1].split(")")[0]
    payload['gpCode'] = gp_code
    with open(f'{gp_code}.pdf', 'wb') as f:
        f.write(requests.post(download_link, data=payload, verify=False).content)

CNN Scraper sporadically working in python

I've tried to create a Web Scraper for CNN. My goal is to scrape all news articles within the search query. Sometimes I get an output for some of the scraped pages and sometimes it doesn't work at all.
I am using selenium and BeautifulSoup packages in Jupiter Notebook. I am iterating over the pages via the url parameters &page={}&from={}. I tried by.XPATH before and simply clicking the next button at the end of the page, but it gave me the same results.
Here's the code I'm using:
#0 ------------import libraries
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import feedparser
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import pandas as pd

#3 ------------CNN SCRAPER
#3.1 ----------Define Funktion
def CNN_Scraper(max_pages):
    """Scrape CNN coronavirus search results into a DataFrame.

    Iterates over result pages via the &page=/&from= URL parameters and
    collects title, date, article body and link for every hit.

    :param max_pages: number of 100-result pages to fetch
    :return: pandas.DataFrame with columns title/date/article/link
    """
    browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
    browser.implicitly_wait(30)  # applies to every subsequent find_* call
    base_url = 'https://edition.cnn.com/search?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100'

    #-------------Define empty lists to be scraped
    CNN_title = []
    CNN_date = []
    CNN_article = []
    CNN_link = []  # was written to the DataFrame but never defined/filled
    article_count = 0

    #-------------iterate over pages and extract
    for page in range(1, max_pages + 1):
        print("Page %d" % page)
        url = base_url + "&page=%d&from=%d" % (page, article_count)
        browser.get(url)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        search_results = soup.find('div', {'class': 'cnn-search__results-list'})
        contents = search_results.find_all('div', {'class': 'cnn-search__result-contents'})
        for content in contents:
            try:
                title = content.find('h3').text
                print(title)
                link_url = content.find('a')['href']
                date = content.find('div', {'class': 'cnn-search__result-publish-date'}).text.strip()
                article = content.find('div', {'class': 'cnn-search__result-body'}).text
            except (AttributeError, TypeError, KeyError):
                # Skip malformed result cards instead of aborting the page.
                print("loser")
                continue
            CNN_title.append(title)
            CNN_date.append(date)
            CNN_article.append(article)
            CNN_link.append(link_url)
        article_count += 100
        print("-----")

    # Quit BEFORE returning -- in the original the quit() sat after
    # `return df` and never ran, leaking a Chrome process per call.
    browser.quit()

    #-------------Save in DF
    df = pd.DataFrame()
    df['title'] = CNN_title
    df['date'] = CNN_date
    df['article'] = CNN_article
    df['link'] = CNN_link
    return df

#3.2 ----------Call Function - Scrape CNN and save pickled data
CNN_data = CNN_Scraper(2)
#CNN_data.to_pickle("CNN_data")
Call the back-end API directly. For more details check my previous answer
import requests
import json

def main(url):
    """Page through CNN's search API, 100 results at a time, printing hits.

    :param url: search URL with a `{}` placeholder for the `from` offset
    """
    # (The function/loop bodies had lost their indentation in the paste.)
    with requests.Session() as req:
        for item in range(1, 1000, 100):
            r = req.get(url.format(item)).json()
            for a in r['result']:
                print("Headline: {}, Url: {}".format(
                    a['headline'], a['url']))

main("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")

I am Scraping multiple web pages which gives the same results as the first page in Python selenium. What would be the reason?

I am scraping goodreads.com using Selenium and Beautiful Soup. I am able to get the results for the first page. When I give the URL for the second page, it loads the first page and gives the first page's results only. I tried different pages and they all load the first page only. What would be the reason, and how can I overcome this?
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager

# First-page site URL: https://www.goodreads.com/shelf/show/business?page=1
driver = webdriver.Chrome(ChromeDriverManager().install())

# Reading the second page
driver.get("https://www.goodreads.com/shelf/show/non-fiction?page=2")
time.sleep(3)

# Result accumulators -- they were appended to but never defined, which
# raised NameError on the first book.
desc, title, author, rating = [], [], [], []

# XPath attribute tests use '@', not '#' (paste artifact).
summaryItems = driver.find_elements_by_xpath("//a[contains(@class, 'bookTitle')]")
job_links = [summaryItem.get_attribute("href") for summaryItem in summaryItems]

for job_link in job_links:
    driver.get(job_link)
    # Close the sign-up pop-up window if present.
    try:
        # find_element (singular): find_elements returns a list, which has
        # no .click() -- the original always fell into the except branch.
        close = driver.find_element_by_class_name('gr-iconButton')
        close.click()
    except Exception:
        pass
    try:
        # Expand the full book description, then grab its text.
        driver.find_element_by_css_selector("#description > a:nth-child(3)").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        sections = soup.findAll("span", id=re.compile("^freeText"))[:2]
        # The second freeText span holds the full description.
        if len(sections) == 2:
            desc.append(sections[1].text)
    except Exception:
        pass
    try:
        # Book title
        job_title = driver.find_element_by_xpath("//h1[@class='gr-h1 gr-h1--serif']").text
        title.append(job_title)
    except Exception:
        pass
    # Author name
    try:
        authors = driver.find_element_by_xpath("//a[@class='authorName']").text
        author.append(authors)
    except Exception:
        pass
    # Rating
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    rate = soup.find("span", itemprop="ratingValue").text.strip()
    rating.append(rate.replace('\n', ''))

driver.close()
Output:
I am able to scrape book title, author name, book description, and rating for the first page only.
You should log in first to scrape data from pages beyond the first one.
Try adding the following code to your script:
# Sign in before visiting shelf pages -- Goodreads only serves pages past
# the first to authenticated users.
driver = webdriver.Chrome(ChromeDriverManager().install())
# Add below code after webdriver.Chrome()
driver.get("https://www.goodreads.com/user/sign_in")
time.sleep(5)
driver.find_element_by_css_selector("#user_email").send_keys("your email")
driver.find_element_by_css_selector("#user_password").send_keys("your password")
# XPath attribute tests use '@', not '#' -- '#type'/'#value' match nothing.
driver.find_element_by_xpath("//input[@type='submit' and @value='Sign in']").click()

How do I force my code to carry out the next for loop?

My code is stopping short before finishing all the tasks.
It should be:
1. Get a link from the search results of fitness classes and open the individual studio page.
2. From the individual studio page (first for loop):
   A) grab the studio name and write it to a CSV file;
   B) grab a link to a fitness class from the class schedule.
3. Open the class page link and grab the class name (second for loop).
It completes step 2, but instead of continuing to step 3, it goes back to the initial search results page and repeats step 1 for the next studio.
What am I doing wrong? Thanks in advance!
from selenium import webdriver
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as browser_wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import re
import csv

# initialize the chrome browser
browser = webdriver.Chrome(executable_path=r'./chromedriver')

# URL
class_pass_url = 'https://www.classpass.com'

# Create file and write the header row (utf-8 to avoid encoding errors)
f = open('ClassPass.csv', 'w', encoding='utf-8')
headers = 'Studio, Name, Description, Image, Address, Phone, Email, Website\n'
f.write(headers)

# classpass results page
page = "https://classpass.com/search/e8-4rb/fitness-classes/58PHLz8oWT9"
browser.get(page)
browser_wait(browser, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "line")))

# Extract the search-results page and collect the studio links
page_soup = soup(browser.page_source, "html.parser")
sessions = page_soup.findAll('li', {'class': '_3vk1F9nlSJQIGcIG420bsK'})

for session in sessions:
    # open the studio page
    session_link = class_pass_url + session.a['href']
    browser.get(session_link)
    browser_wait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1ruz3nW6mOnylv99BOA_tm')))
    session_soup = soup(browser.page_source, "html.parser")

    # get studio name; skip this studio entirely if it is missing, instead
    # of silently reusing the previous iteration's value
    try:
        studio = session_soup.find('h2', {'class': 'gamma'}).text
    except (AttributeError, TypeError):
        continue
    f.write(studio.replace(',', '|') + "\n")
    print('got studio name')

    # BUG FIX: the class-schedule sections must be read from the STUDIO page
    # just parsed (session_soup), not from the stale search-results soup
    # (page_soup) -- that is why the loop never reached step 3.
    classses = session_soup.findAll('section', {'class': '_33uV0qMCu2Sfk4M3oTJjVv'})
    for classs in classses:
        classs_link = class_pass_url + classs.a['href']
        browser.get(classs_link)
        browser_wait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1ruz3nW6mOnylv99BOA_tm')))
        classses_soup = soup(browser.page_source, "html.parser")
        try:
            # the class name lives on the CLASS page, so parse classses_soup
            classs_name = classses_soup.find('span', {'data-component': 'LocalizableMessage'}).text
        except (AttributeError, TypeError):
            continue
        f.write(classs_name.replace(',', '|') + "\n")
        print('got class name')

f.close()  # the file handle was never closed in the original
I'm not quite sure about your goal, since neither your question nor your code is fully explained.
But from my reading, I think this is what you're after.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# (All three function bodies had lost their indentation in the paste,
# which is a SyntaxError; structure restored below.)

def Main():
    """Return the studio-page URLs scraped from the search-results page."""
    r = requests.get(
        "https://classpass.com/search/e8-4rb/fitness-classes/58PHLz8oWT9")
    soup = BeautifulSoup(r.text, 'html.parser')
    urls = []
    for item in soup.findAll("a", {'class': '_3Rgmjog5fetGEXICK2gVhh'}):
        item = item.get("href")
        urls.append(f"https://classpass.com{item}")
    return urls

options = Options()
options.add_argument('--headless')

def Second():
    """Visit each studio page, print its name, and collect class links."""
    urls = Main()
    studios = []
    links = []
    driver = webdriver.Firefox(options=options)
    for url in urls:
        print(f"Extracting: {url}")
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        studio = soup.find('h2', {'class': 'gamma'}).text
        studios.append(studio)
        for item in soup.findAll("a", {'href': True}):
            item = item.get("href")
            if item.startswith("/classes/"):
                print(item)
                links.append(f"https://www.classpass.com{item}")
    driver.quit()
    return links

def Third():
    """Open every class link and print the class name."""
    links = Second()
    driver = webdriver.Firefox(options=options)
    for link in links:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        try:
            name = soup.find(
                'span', {'data-component': 'LocalizableMessage'}).text
            print(name)
        except AttributeError:
            # class name element missing on this page -- skip it
            pass
    driver.quit()

Third()

Categories

Resources