How can I continuously and automatically scrape a website? For example, I want to fetch the trending quotes from the TSX website every 6 hours and update my JSON file accordingly. In other words, how do I run my Python script automatically every 6 hours?
The reason behind this: I will be using the JSON file on my personal website to output the data, styled with HTML and CSS. That way, whenever someone visits my site, the content will have updated since their previous visit, provided the TSX trending quotes have changed.
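One lightweight option, shown as a minimal sketch below, is to keep a small scheduler process running with the schedule package (pip install schedule); it assumes the scraping logic is wrapped in a callable such as the Main() function in the script that follows. A cron job (or a Windows Task Scheduler entry) that simply runs the script every 6 hours works just as well and survives reboots.

# Minimal scheduling sketch using the `schedule` package.
# Assumes the scraper is callable as Main(), as in the script below.
import time
import schedule

def job():
    Main()  # run the scraper and rewrite trendingQuoteData.json

schedule.every(6).hours.do(job)

job()  # run once immediately on startup
while True:
    schedule.run_pending()
    time.sleep(60)  # check once a minute whether the 6-hour mark has passed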
import json
import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait


# grabs all the trending quotes for that day
def getTrendingQuotes(browser):
    # wait until the trending links appear (not strictly needed, just an example)
    all_trendingQuotes = WebDriverWait(browser, 10).until(
        lambda d: d.find_elements_by_css_selector('#trendingQuotes a')
    )
    return [link.get_attribute('href') for link in all_trendingQuotes]


def getStockDetails(url, browser):
    print(url)
    browser.get(url)
    quote_wrapper = browser.find_element_by_css_selector('div.quote-wrapper')
    quote_name = quote_wrapper.find_element_by_class_name(
        "quote-name").find_element_by_tag_name('h2').text
    quote_price = quote_wrapper.find_element_by_class_name("quote-price").text
    quote_volume = quote_wrapper.find_element_by_class_name(
        "quote-volume").text
    print("\n")
    print("Quote Name: " + quote_name)
    print("Quote Price: " + quote_price)
    print("Quote Volume: " + quote_volume)
    print("\n")
    convertToJson(quote_name, quote_price, quote_volume, url)


quotesArr = []


# Convert to a JSON file
def convertToJson(quote_name, quote_price, quote_volume, url):
    quoteObject = {
        "url": url,
        "Name": quote_name,
        "Price": quote_price,
        "Volume": quote_volume
    }
    quotesArr.append(quoteObject)


def trendingBot(url, browser):
    browser.get(url)
    trending = getTrendingQuotes(browser)
    for trend in trending:
        getStockDetails(trend, browser)
    # requests finished, write json to file
    with open('trendingQuoteData.json', 'w') as outfile:
        json.dump(quotesArr, outfile)


def Main():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # applicable to Windows OS only
    chrome_options.add_argument('--disable-gpu')
    url = 'https://www.tmxmoney.com/en/index.html'
    browser = webdriver.Chrome(
        r"C:\Users\austi\OneDrive\Desktop\chromeDriver\chromedriver_win32\chromedriver.exe",
        chrome_options=chrome_options)
    browser.get(url)
    os.system('cls')
    print("[+] Success! Bot Starting!")
    trendingBot(url, browser)
    # trendingBot(browser)
    browser.quit()


if __name__ == "__main__":
    Main()
First, I'm sorry for this long message, but I have an issue that is blocking me from advancing in my project. Let me quickly explain the workflow: the user enters a search query -> a search is made on LinkedIn with this query -> the URLs of the users are grabbed (depending on the number of pages) -> those users are looked up in Proxycurl (https://nubela.co/proxycurl/docs#people-api-person-lookup-endpoint) -> their info is grabbed with a function -> stored in my DB -> the experiences of the scraped users are grabbed -> another Proxycurl search is made, this time for the companies -> company info is grabbed and stored in the DB -> the employees of each company are searched (https://nubela.co/proxycurl/docs#company-api-employee-search-api-endpoint) -> the URL of the CTO is grabbed -> the Contact API is queried for the CTO's details (https://nubela.co/proxycurl/docs#contact-api-personal-contact-number-lookup-endpoint and https://nubela.co/proxycurl/docs#contact-api-personal-email-lookup-endpoint) -> everything is stored in the database.
OK, so I manage to grab the URLs and look up the users in the API, but I never manage to get the 'extra' information with my code, even though I can grab it for the same profiles in Postman; the same goes for personal_email, personal_contact_number and github_profile_id.
I also manage to grab the data about the companies, but I have the same problem: I can't retrieve the 'extra' information, 'funding_data' or 'acquisitions', even though I include them in my code.
I really don't know what's wrong with my code (I'm assuming something is wrong, because everything works perfectly in Postman), and I could use a little help here. Thanks for your time! (Full code below.)
import requests
from datetime import datetime
import json
import re
import selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag
from time import sleep
from time import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
import openpyxl

cred = credentials.Certificate(r"C:\Users\radia\Downloads\st-londres-2-firebase-adminsdk-7eowq-786e799875.json")
firebase_admin.initialize_app(cred, {
    'databaseURL': 'https://st-londres-2-default-rtdb.firebaseio.com/'
})
print('- Importing packages')

# Task 1: webdriver configuration
driver = webdriver.Chrome(ChromeDriverManager().install())

# Task 1.1: Open Chrome and access LinkedIn
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Chrome driver initialised')
sleep(2)

# Task 1.2: Import username and password
credential = open(r"C:\Users\radia\OneDrive\Bureau\credentials.txt")
line = credential.readlines()
username = line[0]
password = line[1]
print('Credentials imported')
sleep(2)

# Task 1.2: Key in login credentials
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email ok')
sleep(3)
password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Password ok')
sleep(2)

# Task 1.2: Click the Login button
signin_field = driver.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)

print('- Task A: logged in to LinkedIn')

search_field = driver.find_element(By.XPATH, '//*[@id="global-nav-typeahead"]/input')
search_query = input('Type of profile to scrape: ')
search_field.send_keys(search_query)
search_field.send_keys(Keys.RETURN)
print('TASK B OK')
sleep(10)

try:
    driver.find_element(By.XPATH, "//*[@id='search-reusables__filters-bar']/ul/li[2]/button").click()
except selenium.common.exceptions.NoSuchElementException:
    print("Element not found")
def GetURL():  # function to grab LinkedIn urls
    page_source = BeautifulSoup(driver.page_source, features='lxml')
    a_elements = page_source.find_all('a', {'class': "app-aware-link"})
    all_urls = []
    for element in a_elements:
        url = element.get('href')
        all_urls.append(url)
    return all_urls


## Pagination
sleep(2)
input_page = int(input('Number of pages to scrape: '))
URLs_all_page = []
for page in range(input_page):
    URLs_one_page = GetURL()
    sleep(2)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')  # scroll to the end of the page
    sleep(3)
    next_button = driver.find_element(By.XPATH, '//button[contains(@class, "artdeco-pagination__button--next") and .//li-icon]')
    driver.execute_script("arguments[0].click();", next_button)
    sleep(2)
    if URLs_one_page is not None:
        URLs_all_page = URLs_all_page + URLs_one_page
        print(URLs_all_page)
    else:
        print('variable stores a None value')
        sleep(2)
        print(URLs_all_page)
sleep(1)
def get_profile_info(url):  # function to make API calls for users
    api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    headers = {'Authorization': 'Bearer ' + api_key}
    params = {
        'url': url,
        'fallback_to_cache': 'on-error',
        'use_cache': 'if-present',
        'skills': 'include',
        'inferred_salary': 'include',
        'personal_email': 'include',
        'personal_contact_number': 'include',
        'twitter_profile_id': 'include',
        'facebook_profile_id': 'include',
        'github_profile_id': 'include',
        'extra': 'include',
    }
    try:
        response = requests.get(api_endpoint, headers=headers, params=params)
        if response.status_code != 404:
            data_profile = response.json()
            return data_profile
        else:
            return None
    except requests.exceptions.RequestException as e:
        print(e)
        return None
def get_company_info(url):  # function to make API calls for companies
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    headers = {'Authorization': 'Bearer ' + api_key}
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
    params = {
        'resolve_numeric_id': 'true',
        'categories': 'include',
        'funding_data': 'include',
        'extra': 'include',
        'exit_data': 'include',
        'acquisitions': 'include',
        'url': 'include',
        'use_cache': 'if-present',
    }
    try:
        response = requests.get(api_endpoint, params={'url': url}, headers=headers)
        if response.status_code == 404:
            print("Company not found for URL:", url)
            return None
        else:
            data_company = response.json()
            print(data_company)
            if 'extra' in data_company:
                print("Extra information found:", data_company['extra'])
            else:
                print("No extra information found in JSON response.")
            return data_company
    except requests.exceptions.RequestException as e:
        print(e)
        return None
def get_company_employee_url(company_linkedin_profile_url):
    api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/employee/search/'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        'page_size': '10',
        'linkedin_company_profile_url': company_linkedin_profile_url,
        'keyword_regex': '[Cc][Tt][Oo]',
        'enrich_profiles': 'enrich',
        'resolve_numeric_id': 'false',
    }
    response = requests.get(api_endpoint,
                            params=params,
                            headers=header_dic)
    print(response.status_code)
    print(response.text)
    if response.status_code == 404:
        print("No employees found for URL:", url)
        return None
    else:
        data_employees = response.json()
        if 'employees' in data_employees:
            print("Employees found:", data_employees['employee_search_results'])
        else:
            print("No employees found in JSON response.")
        # return and store profile_url in data_employees:
        for employee in data_employees['employee_search_results']:
            profile_url = employee['profile_url']
            print(profile_url)
def get_company_employee_info(profile_url):
    api_endpoint = 'https://nubela.co/proxycurl/api/contact-api/personal-contact'
    api_key = 'SDrD73S2fXlvCMdFDExEaw'
    header_dic = {'Authorization': 'Bearer ' + api_key}
    params = {
        'linkedin_profile_url': 'https://linkedin.com/in/test-phone-number',
    }
    response = requests.get(api_endpoint,
                            params=params,
                            headers=header_dic)
# Initialize visited URLs + data_list
visited_urls = []

for url in URLs_all_page:
    if url in visited_urls:
        print("Profile already exists in the database for URL:", url)
        continue
    data = get_profile_info(url)
    if data and "error" in data:
        print(data["error"])
    if not data or "experiences" not in data:
        continue
    data["search_query"] = search_query  # Add the search_query to the data
    db.reference('profiles').push(data)  # Store data in the candidates table
    visited_urls.append(url)
    print("Profile data and search query successfully added to the candidates table for URL:", url)

    for item in data['experiences']:
        company_name = str(item['company'])
        company_name_push = re.sub(r'[^a-zA-Z0-9]', '', company_name)  # Error handling when pushing to db, removal of illegal characters
        company_linkedin_profile_url = item['company_linkedin_profile_url']
        company_description = item['description']
        company_data = get_company_info(company_linkedin_profile_url)
        if company_name_push:
            filtered_company = db.reference('companies/' + company_name_push).get()
        else:
            continue
        if filtered_company is None:
            db.reference('companies').push({
                'company_name': company_name_push,
                'company_linkedin_profile_url': company_linkedin_profile_url,
                'company_description': company_description,
                'company_data': company_data
            })
            print("Company data successfully added for URL:", company_linkedin_profile_url)
        else:
            print("Company already exists in the database for URL:", company_linkedin_profile_url)
        experiences = {
            'candidate_name': data['full_name'],
            'title': item['title'],
            'company': item['company'],
            'location': item['location'],
            'start_date': item['starts_at'],
            'end_date': item['ends_at'],
            'description': item['description'],
        }
        db.reference('experiences').push(experiences)
        company_employee_url = get_company_employee_url(company_linkedin_profile_url)
        company_employee_data = get_company_employee_info(company_employee_url)
        if company_employee_data:
            db.reference('company_employees/' + company_name_push).push(company_employee_data)
            print("Company employee data successfully added for company:", company_name)
        else:
            print("No data found for company employees for company:", company_name)
I am working on a stock-related project where I have to scrape all the data on a daily basis for the last 5 years, i.e. from 2016 to date. I thought of using Selenium because I can drive the crawler with button clicks to pull up the data for a given date. Now I want the same data that the Selenium browser displays to be fed to Scrapy.
This is the website I am working on right now.
I have written the following code inside a Scrapy spider.
import scrapy
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.firefox import GeckoDriverManager


class FloorSheetSpider(scrapy.Spider):
    name = "nepse"

    def start_requests(self):
        driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

        floorsheet_dates = ['01/03/2016', '01/04/2016']  # ..., up to '01/10/2022'

        for date in floorsheet_dates:
            driver.get("https://merolagani.com/Floorsheet.aspx")
            driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                ).send_keys(date)
            driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
            total_length = driver.find_element(By.XPATH,
                                               "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
            z = int((total_length.split()[-1]).replace(']', ''))
            for data in range(z, z + 1):
                driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
                self.url = driver.page_source
                yield Request(url=self.url, callback=self.parse)

    def parse(self, response, **kwargs):
        for value in response.xpath('//tbody/tr'):
            print(value.css('td::text').extract()[1])
            print("ok" * 200)
Update: the error after applying the answer is
2022-01-14 14:11:36 [twisted] CRITICAL:
Traceback (most recent call last):
File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/twisted/internet/defer.py", line 1661, in _inlineCallbacks
result = current_context.run(gen.send, result)
File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/scrapy/crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable
I want to feed the current page's HTML content to the Scrapy pipeline, but I have been getting this unusual error for the past 2 days. Any help or suggestions will be very much appreciated.
The two solutions are not very different. Solution #2 fits your question better, but choose whichever you prefer.
Solution 1 - create a response from the driver's page source and scrape it right away (you can also pass it as an argument to a function):
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from scrapy.http import HtmlResponse


class FloorSheetSpider(scrapy.Spider):
    name = "nepse"

    def start_requests(self):
        # driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
        driver = webdriver.Chrome()

        floorsheet_dates = ['01/03/2016', '01/04/2016']  # ..., up to '01/10/2022'

        for date in floorsheet_dates:
            driver.get("https://merolagani.com/Floorsheet.aspx")
            driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                ).send_keys(date)
            driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
            total_length = driver.find_element(By.XPATH,
                                               "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
            z = int((total_length.split()[-1]).replace(']', ''))
            for data in range(1, z + 1):
                driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
                self.body = driver.page_source

                response = HtmlResponse(url=driver.current_url, body=self.body, encoding='utf-8')
                for value in response.xpath('//tbody/tr'):
                    print(value.css('td::text').extract()[1])
                    print("ok" * 200)

        # return an empty requests list
        return []
Solution 2 - with a super simple downloader middleware:
(There may be a delay before the parse method runs, so be patient.)
import scrapy
from scrapy import Request
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.by import By


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        url = spider.driver.current_url
        body = spider.driver.page_source
        return HtmlResponse(url=url, body=body, encoding='utf-8', request=request)


class FloorSheetSpider(scrapy.Spider):
    name = "nepse"

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'tempbuffer.spiders.yetanotherspider.SeleniumMiddleware': 543,
            # 'projects_name.path.to.your.pipeline': 543
        }
    }
    driver = webdriver.Chrome()

    def start_requests(self):
        # driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

        floorsheet_dates = ['01/03/2016', '01/04/2016']  # ..., up to '01/10/2022'

        for date in floorsheet_dates:
            self.driver.get("https://merolagani.com/Floorsheet.aspx")
            self.driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                     ).send_keys(date)
            self.driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
            total_length = self.driver.find_element(By.XPATH,
                                                    "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
            z = int((total_length.split()[-1]).replace(']', ''))
            for data in range(1, z + 1):
                self.driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
                self.body = self.driver.page_source
                self.url = self.driver.current_url

                yield Request(url=self.url, callback=self.parse, dont_filter=True)

    def parse(self, response, **kwargs):
        print('test ok')
        for value in response.xpath('//tbody/tr'):
            print(value.css('td::text').extract()[1])
            print("ok" * 200)
Notice that I've used Chrome, so change it back to Firefox as in your original code.
I want to parse a list of links from this website.
I am trying to do this with the requests library in Python. However, when I read the HTML with bs4 there aren't any links, just an empty ul:
<ul class="ais-Hits-list"></ul>
How can I get these links?
Edit:
The code I tried so far:
link = "https://www.over-view.com/digital-index/"
r = requests.get(link)
soup = BeautifulSoup(r.content, 'lxml')
Since the information loads dynamically on that website, you can use Selenium to collect the required information:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--window-size=1920x1080")
path_to_chromedriver ='chromedriver'
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=path_to_chromedriver)
driver.get('https://www.over-view.com/digital-index/')
time.sleep(5)
soup = BeautifulSoup(driver.page_source, "lxml")
rows = soup.select("ul.ais-Hits-list > li > a")
for row in rows:
    print(row.get('href'))
Example of output:
/overviews/adelaide-canola-flowers
/overviews/adelaide-rift-complex
/overviews/adriatic-tankers
/overviews/adventuredome
/overviews/agricultural-development
/overviews/agricultural-development
/overviews/agricultural-development
/overviews/agriculture-development
There is also a slightly more extravagant way (don't judge too harshly, this is my first time trying this approach): you can make the same request to the API that their frontend makes. As a bonus, this code runs asynchronously thanks to asyncio + aiohttp.
Keep in mind that I chose an arbitrary number of pages to iterate over and didn't handle possible errors (you will need to fine-tune that).
Code without Selenium WebDriver
import json
import asyncio
import aiohttp

URL = "https://ai7o5ij8d5-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia for JavaScript (3.35.1); Browser (lite); react (16.13.1); react-instantsearch (5.7.0); JS Helper (2.28.0)&x-algolia-application-id=AI7O5IJ8D5&x-algolia-api-key=7f1a509e834f885835edcfd3482b990c"


async def scan_single_digital_index_page(page_num, session):
    body = {
        "requests": [
            {
                "indexName": "overview",
                "params": f"query=&hitsPerPage=30&maxValuesPerFacet=10&page={page_num}&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facets=%5B%22_tags.name%22%5D&tagFilters=",
            }
        ]
    }
    async with session.post(URL, json=body) as resp:
        received_data = await resp.json()
        results = received_data.get("results")
        hits = results[0].get("hits")
        links = list()
        for hit in hits:
            for key, value in hit.items():
                if key == "slug":
                    links.append("https://www.over-view.com/overviews/" + value)
        return links


async def scan_all_digital_index_pages(session):
    tasks = list()
    max_pages = 20
    for page_num in range(1, max_pages):
        task = asyncio.create_task(scan_single_digital_index_page(page_num, session))
        tasks.append(task)
    all_lists = await asyncio.gather(*tasks)
    # Unpack all lists with links into a single set of all links.
    all_links = set()
    for l in all_lists:
        all_links.update(l)
    return all_links


async def main():
    async with aiohttp.ClientSession() as session:
        all_links = await scan_all_digital_index_pages(session)
        for link in all_links:
            print(link)


if __name__ == "__main__":
    asyncio.run(main())
Example result for the first page
https://www.over-view.com/overviews/adelaide-canola-flowers
https://www.over-view.com/overviews/adelaide-rift-complex
https://www.over-view.com/overviews/adriatic-tankers
https://www.over-view.com/overviews/adventuredome
https://www.over-view.com/overviews/agricultural-development
https://www.over-view.com/overviews/agricultural-development
https://www.over-view.com/overviews/agricultural-development
https://www.over-view.com/overviews/agriculture-development
https://www.over-view.com/overviews/akimiski-island
https://www.over-view.com/overviews/al-falah-housing-project
https://www.over-view.com/overviews/alabama-tornadoes
https://www.over-view.com/overviews/alakol-lake
https://www.over-view.com/overviews/albenga
https://www.over-view.com/overviews/albuquerque-baseball-complex
https://www.over-view.com/overviews/alta-wind-energy-center
https://www.over-view.com/overviews/altocumulus-clouds
https://www.over-view.com/overviews/amsterdam
https://www.over-view.com/overviews/anak-krakatoa-eruption-juxtapose
https://www.over-view.com/overviews/ancient-ruins-of-palmyra
https://www.over-view.com/overviews/andean-mountain-vineyards
https://www.over-view.com/overviews/angas-inlet-trees
https://www.over-view.com/overviews/angkor-wat
https://www.over-view.com/overviews/ankara-residential-development
https://www.over-view.com/overviews/antofagasta-chile
https://www.over-view.com/overviews/apple-park
https://www.over-view.com/overviews/aquatica-water-park
https://www.over-view.com/overviews/aral-sea
https://www.over-view.com/overviews/arc-de-triomphe
https://www.over-view.com/overviews/arecibo-observatory
https://www.over-view.com/overviews/arizona-rock-formations
For future changes (as there are many moving parts), you can get the info about their API from the browser's Web Console.
My code is stopping short before finishing all the tasks.
It should be:
1 - Get a link from the fitness-class search results and go to the individual studio page.
2 - Then, from the individual studio page (first for loop):
A) grab the studio name and write it to the CSV;
B) grab a link to a fitness class from the class schedule.
3 - Open the class page link and grab the class name (second for loop).
It completes step 2 but, instead of continuing to step 3, it goes back to the initial search results page and repeats step 1 for the next studio in order.
What am I doing wrong? Thanks in advance!
from selenium import webdriver
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as browser_wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import re
import csv

# initialize the chrome browser
browser = webdriver.Chrome(executable_path=r'./chromedriver')

# URL
class_pass_url = 'https://www.classpass.com'

# Create file and write the first row; encoding specified because write was giving errors
f = open('ClassPass.csv', 'w', encoding='utf-8')
headers = 'Studio, Name, Description, Image, Address, Phone, Email, Website\n'
f.write(headers)

# classpass results page
page = "https://classpass.com/search/e8-4rb/fitness-classes/58PHLz8oWT9"
browser.get(page)

# Browser waits
browser_wait(browser, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "line")))

# Scrolls to bottom of page to reveal all classes
# browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Extract page source and parse
page_source = browser.page_source
page_soup = soup(page_source, "html.parser")

# Parse class listings: looks through results and gets link to class page
sessions = page_soup.findAll('li', {'class': '_3vk1F9nlSJQIGcIG420bsK'})

for session in sessions:
    # gets link to class page
    session_link = class_pass_url + session.a['href']
    browser.get(session_link)
    browser_wait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1ruz3nW6mOnylv99BOA_tm')))

    # parses class page
    session_page_source = browser.page_source
    session_soup = soup(session_page_source, "html.parser")

    # get studio name
    try:
        studio = session_soup.find('h2', {'class': 'gamma'}).text
    except (AttributeError, TypeError,) as e:
        pass

    # write studio name
    f.write(studio.replace(',', '|') + "\n")
    print('got studio name')

    # gets link to individual class in class schedule table
    classses = page_soup.findAll('section', {'class': '_33uV0qMCu2Sfk4M3oTJjVv'})
    for classs in classses:
        classs_link = class_pass_url + classs.a['href']
        browser.get(classs_link)
        browser_wait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1ruz3nW6mOnylv99BOA_tm')))

        # parses individual class page
        classses_page_source = browser.page_source
        classses_soup = soup(classses_page_source, "html.parser")

        try:
            classs_name = session_soup.find('span', {'data-component': 'LocalizableMessage'}).text
        except (AttributeError, TypeError,) as e:
            pass

        # gets class names
        f.write(classs_name.replace(',', '|') + "\n")
        print('got class name')
I'm not quite sure about your goal, since your question and your code aren't explained very well. But from my point of view, I think this is what you're after.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


def Main():
    r = requests.get(
        "https://classpass.com/search/e8-4rb/fitness-classes/58PHLz8oWT9")
    soup = BeautifulSoup(r.text, 'html.parser')
    urls = []
    for item in soup.findAll("a", {'class': '_3Rgmjog5fetGEXICK2gVhh'}):
        item = item.get("href")
        urls.append(f"https://classpass.com{item}")
    return urls


options = Options()
options.add_argument('--headless')


def Second():
    urls = Main()
    studios = []
    links = []
    driver = webdriver.Firefox(options=options)
    for url in urls:
        print(f"Extracting: {url}")
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        studio = soup.find('h2', {'class': 'gamma'}).text
        studios.append(studio)
        for item in soup.findAll("a", {'href': True}):
            item = item.get("href")
            if item.startswith("/classes/"):
                print(item)
                links.append(f"https://www.classpass.com{item}")
    driver.quit()
    return links


def Third():
    links = Second()
    driver = webdriver.Firefox(options=options)
    for link in links:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        try:
            name = soup.find(
                'span', {'data-component': 'LocalizableMessage'}).text
            print(name)
        except:
            pass
    driver.quit()


Third()
I use the code below to crawl this site. The first few results come out well, but then I get an error. I want your advice: what can I do?
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup, Comment
import pandas as pd

# Setting up Chrome webdriver options
# chrome_options = webdriver.ChromeOptions()
# setting up local path of chrome binary file
# chrome_options.binary_location = "/Users/Norefly/chromedriver2/chromedriver.exec"
# creating Chrome webdriver instance with the set chrome_options
driver = webdriver.PhantomJS("C:/Python/phantomjs-2.1.1-windows/bin/phantomjs.exe")

link = "https://play.google.com/store/apps/details?id=com.supercell.clashofclans&hl=en"
driver.get(link)
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
Ptitle = driver.find_element_by_class_name('id-app-title').text.replace(' ', '')
print(Ptitle)
# driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]').click()
sleep(1)
driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
# select_newest.select_by_visible_text('Newest')
# driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
sleep(2)
# driver.find_element_by_css_selector('.review-filter.id-review-sort-filter.dropdown-menu-container').click()
driver.find_element_by_css_selector('.displayed-child').click()
# driver.find_element_by_xpath("//button[@data-dropdown-value='1']").click()
driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")

reviews_df = []

for i in range(1, 10000):
    try:
        for elem in driver.find_elements_by_class_name('single-review'):
            print(str(i))
            content = elem.get_attribute('outerHTML')
            soup = BeautifulSoup(content, "html.parser")
            # print(soup.prettify())
            date = soup.find('span', class_='review-date').get_text()
            rating = soup.find('div', class_='tiny-star')['aria-label'][6:7]
            title = soup.find('span', class_='review-title').get_text()
            txt = soup.find('div', class_='review-body').get_text().replace('Full Review', '')[len(title) + 1:]
            print(soup.get_text())
            temp = pd.DataFrame({'Date': date, 'Rating': rating, 'Review Title': title, 'Review Text': txt}, index=[0])
            print('-' * 10)
            reviews_df.append(temp)
            # print(elem)
    except:
        print('s')
    driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()

reviews_df = pd.concat(reviews_df, ignore_index=True)
reviews_df.to_csv(Ptitle + 'review_google.csv', encoding='utf-8')
# driver.close()
This error occurred during the crawl, but I don't understand it. The operating system is Windows; I'm doing the analysis in Python and using PhantomJS for the Google Play Store crawling.
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.ElementNotVisibleException:
Message: {
"errorMessage": "Element is not currently visible and may not be manipulated",
"request":{
"headers":{
"Accept":"application/json",
"Accept-Encoding":"identity",
"Connection":"close",
"Content-Length":"81",
"Content-Type":"application/json; charset=UTF-8",
"Host":"127.0.0.1:58041",
"User-Agent":"Python http auth"
},
"httpVersion":"1.1",
"method":"POST",
"post":"{
\"id\": \":wdc:1505360987512\",
\"sessionId\": \"b7c59070-98ff-11e7-8363-fdfc8cdfd230 \"
}",
"url":"/click",
"urlParsed": {
"anchor":"",
"query":"",
"file":"click",
"directory":"/",
"path":" /click",
"relative":" /click",
"port":"",
"host":"",
"password":"",
"user":"",
"userInfo":"",
"authority":" ",
"protocol":"",
"source":"/click",
"queryKey":{},
"chunks": ["click"]
},
"urlOriginal":"/session/b7c59070-98ff-11e7-8363-fdfc8cdfd230/element /:wdc:1505360987512/click"
}
}
Screenshot: available via screen
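The ElementNotVisibleException means the element exists in the DOM but is not visible at the moment the click fires. A common mitigation, shown as a sketch only (it reuses the driver and the XPath from the script above, which may need adjusting), is to wait explicitly until the element is clickable and scroll it into view before clicking:

# Sketch: wait for the element to become clickable before clicking it.
# `driver` is the WebDriver instance from the script above; the XPath is the same one it uses.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

xpath = '//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div'
element = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, xpath))
)
driver.execute_script("arguments[0].scrollIntoView(true);", element)  # bring it on screen
element.click()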