Python Selenium: scrape data when the "Load More" button doesn't change the URL

I am using the following code to attempt to keep clicking a "Load More" button until all page results are shown on the website:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def startWebDriver():
    global driver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=chrome_options)

startWebDriver()
driver.get("https://together.bunq.com/all")
time.sleep(4)

while True:
    try:
        wait = WebDriverWait(driver, 10, 10)
        element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "[title='Load More']")))
        element.click()
        print("Loading more page results")
    except:
        print("All page results displayed")
        break
However, since the button click does not change the URL, no new data is loaded into chromedriver and the while loop will break on the second iteration.

Selenium is overkill for this. You only need requests. Logging one's network traffic reveals that at some point JavaScript makes an XHR HTTP GET request to a REST API endpoint, the response of which is JSON and contains all the information you're likely to want to scrape.
One of the query-string parameters for that endpoint URL is page[offset], which is used to offset the query results for pagination (in this case, the "Load More" button). A value of 0 corresponds to no offset, i.e. "start at the beginning". Increment this value to suit your needs; a loop is probably a good place to do this (see the pagination sketch after the output below).
Simply imitate that XHR HTTP GET request - copy the API endpoint URL and query-string parameters and request headers, then parse the JSON response:
def get_discussions():
    import requests

    url = "https://together.bunq.com/api/discussions"
    params = {
        "include": "user,lastPostedUser,tags,firstPost",
        "page[offset]": 0
    }
    headers = {
        "user-agent": "Mozilla/5.0"
    }

    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()

    yield from response.json()["data"]


def main():
    for discussion in get_discussions():
        print(discussion["attributes"]["title"])
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
Output:
⚡️What’s new in App Update 18.8.0
Local Currencies Accounts Fees
Local Currencies 💸
Spanish IBANs Service Coverage 🇪🇸
bunq Update 18 FAQ 📚
Phishing and Spoofing - The new ways of scamming, explained 👀
Easily rent a car with your bunq credit card 🚗
Giveaway - Hallo Deutschland! 🇩🇪
Giveaway - Hello Germany! 🇩🇪
True Name: Feedback 💬
True Name 🌈
What plans are available?
Everything about refunds 💸
Identity verification explained! 🤳
When will I receive my payment?
Together Community Guidelines 📣
What is my Tax Identification Number (TIN)?
How do I export a bank statement?
How do I change my contact info?
Large cash withdrael
If this is a new concept for you, I would suggest you look up tutorials on how to use your browser's developer tools (Google Chrome's Devtools, for example), how to log your browser's network traffic, REST APIs, HTTP, etc.
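To page through every discussion rather than just the first batch, here is a minimal pagination sketch built on the same endpoint and parameters as above. The stopping condition (an empty "data" list) is an assumption you would want to verify against the API's actual behaviour:

import requests

def get_all_discussions():
    url = "https://together.bunq.com/api/discussions"
    headers = {"user-agent": "Mozilla/5.0"}
    offset = 0
    while True:
        params = {
            "include": "user,lastPostedUser,tags,firstPost",
            "page[offset]": offset
        }
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        data = response.json()["data"]
        if not data:  # assumption: an empty page means we've run out of results
            break
        yield from data
        offset += len(data)  # advance the offset by however many items this page returned

for discussion in get_all_discussions():
    print(discussion["attributes"]["title"])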

Related

selenium python iframe get https requests

I'm using Selenium with Python, and I want to get all HTTPS requests from an iframe element.
Here I get the iframe element, and after I select a row from the table and press a button, an HTTP POST request is started.
Part of my code:
Define the Chrome driver:
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument('--auto-open-devtools-for-tabs')
chrome_options.add_argument('--log-level=2')
chrome_options.add_argument('--disable-features=IsolateOrigins,site-per-process')
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL",'browser':'ALL','server':'ALL'} # chromedriver 75+
capabilities["goog:chromeOptions"] = {"w3c": "false","args": "%w[headless window-size=1280,800]"} # chromedriver 75+
capabilities['acceptSslCerts'] = True
capabilities['acceptInsecureCerts'] = True
capabilities['PageLoadStrategy'] = None
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options,desired_capabilities=capabilities)
driver.get(os.environ['URL'])
Get the iframe element and click on the table row:
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, '//*[@id="myiframe"]')))
row_selector = '//*[@id="root"]/div/div/div/div[2]/div[2]/div/div/div/div/table/tbody/tr[1]/th[3]'
row_selector_clickable = '//*[@id="root"]/div/div/div/div[2]/div[2]/div/div/div/div/table/tbody/tr[1]/th[3]/div/div/div[2]/div/button'
WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, row_selector)))
actions = ActionChains(driver)
row_element = driver.find_element(By.XPATH, row_selector)
row_clickable = driver.find_element(By.XPATH, row_selector_clickable)
actions.move_to_element(row_element)
actions.click(row_clickable)
actions.perform()
Then here, I get all HTTP POST requests and write them to a file:
logs = driver.get_log("performance")

def process_browser_logs_for_network_events(logs):
    """
    Return only logs which have a method that starts with "Network.response", "Network.request",
    or "Network.webSocket", since we're interested in the network events specifically.
    """
    for entry in logs:
        log = json.loads(entry["message"])["message"]
        yield log

events = process_browser_logs_for_network_events(logs)

li = []
with open(f"log_entries-{datetime.datetime.now()}.txt", "wt") as out:
    for event in events:
        print(event)
        if 'method' in event.get('params', {}).get('request', {}) \
                and event.get('params', {}).get('request', {}).get('method', '') == 'POST':
            li.append(event)
    out.write(json.dumps(li))
But the issue is that it shows me requests from the first page, I guess, even though I switch to the iframe and it selects the right elements from the iframe.
The flow is this: I log in to the website, get redirected to the main page, then click a button that opens a new tab, and there I have the iframe. I switch to the iframe, press a row in the table, and that triggers an HTTP request which takes 5-10 seconds (it has pending status during that time). When it succeeds, it redirects to the Gmail website and the HTTP request disappears because of the redirect, so I tried enabling "preserve log", but still no luck.
I can't share the HTTPS requests because they belong to my job, but what I'm seeing is requests from the first page and not from the current iframe.
Ok, I'll try to be as clear as I can: the Selenium setup below is Linux/Selenium/Chrome; you can adapt it to your own, just observe the imports and the code after defining the browser/driver.
For intercepting the browser's requests I used selenium-wire: https://pypi.org/project/selenium-wire/
If you prefer, you can use native Selenium request intercepting.
I looked around for an example website containing an iframe you interact with, i.e. click a button (OP should have done the legwork and provided such an example, but anyway).
Code:
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time as t
from datetime import datetime

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("window-size=1280,720")

webdriver_service = Service("chromedriver/chromedriver")  ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

url = 'https://fasecolda.com/ramos/automoviles/historial-de-accidentes-de-vehiculos-asegurados/'
browser.get(url)

for x in browser.requests:
    print('URL:', x.url)
    print('ORIGIN:', x.headers['origin'])
    print('HOST:', x.headers['Host'])
    print('SEC-FETCH-DEST:', x.headers['sec-fetch-dest'])
    print('TIMESTAMP:', x.date)
    print('______________________________________')

WebDriverWait(browser, 20).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//*[@title='Términos']")))
t.sleep(3)
print('switched to iframe')

button = WebDriverWait(browser, 5).until(EC.element_to_be_clickable((By.XPATH, '//*[text()="Acepto los Términos y Condiciones"]')))
print('located the button, bringing it into view')
button.location_once_scrolled_into_view
print('now waiting 30 seconds, for clear separation of requests')
t.sleep(30)

print('printing last request again:')
print('URL:', browser.last_request.url)
print('ORIGIN:', browser.last_request.headers['origin'])
print('HOST:', browser.last_request.headers['Host'])
print('SEC-FETCH-DEST:', browser.last_request.headers['sec-fetch-dest'])
print('TIMESTAMP:', browser.last_request.date)
last_date = browser.last_request.date

print('clicked the button at', datetime.now())
button.click()
print('waiting another 30 seconds')
t.sleep(30)

print('and now printing the requests again (the ones after interacting with iframe)')
for x in browser.requests:
    if x.date > last_date:
        print('URL:', x.url)
        print('ORIGIN:', x.headers['origin'])
        print('HOST:', x.headers['Host'])
        print('SEC-FETCH-DEST:', x.headers['sec-fetch-dest'])
        print('TIMESTAMP:', x.date)
        print('______________________________________')
As you can see, it's pretty straightforward:
go to the website
print the requests made (url, origin, host, sec-fetch-dest and timestamp)
locate the iframe and switch to it
locate the button you want to click, and bring it into view
wait 30 seconds, for any eventual requests made by JS in the page
after 30 seconds, print the last request made (url, origin, host, sec-fetch-dest and timestamp), saving its timestamp into a variable in order to filter subsequent requests
click the button and register the timestamp of the click
wait another 30 seconds, just to make sure all requests were performed
print the requests made after the saved timestamp
The result in terminal:
[...]
______________________________________
URL: https://fonts.gstatic.com/s/roboto/v30/KFOlCnqEu92Fr1MmEU9fBBc4.woff2
ORIGIN: https://siniestroshava.com.co
HOST: None
SEC-FETCH-DEST: font
TIMESTAMP: 2022-10-08 21:44:44.794670
______________________________________
switched to iframe
located the button, bringing it into view
now waiting 30 seconds, for clear separation of requests
printing last request again:
URL: https://optimizationguide-pa.googleapis.com/v1:GetModels?key=AIzaSyCkfPOPZXDKNn8hhgu3JrA62wIgC93d44k
ORIGIN: None
HOST: None
SEC-FETCH-DEST: empty
TIMESTAMP: 2022-10-08 21:44:57.413952
clicked the button at 2022-10-08 21:45:19.036690
waiting another 30 seconds
and now printing the requests again (the ones after interacting with iframe)
URL: https://siniestroshava.com.co/hava/Seguridad/SolicitarCorreo
ORIGIN: None
HOST: siniestroshava.com.co
SEC-FETCH-DEST: iframe
TIMESTAMP: 2022-10-08 21:45:19.209288
______________________________________
URL: https://siniestroshava.com.co/hava/css/hava/estiloslocales.css
ORIGIN: None
HOST: siniestroshava.com.co
SEC-FETCH-DEST: style
TIMESTAMP: 2022-10-08 21:45:19.633076
______________________________________
URL: https://siniestroshava.com.co/hava/css/vendor.css?v=U1BT8Ls9ntdpDS12L5xpMjmSP3Eitncl_SyDnU5LLHk
ORIGIN: None
HOST: siniestroshava.com.co
SEC-FETCH-DEST: style
TIMESTAMP: 2022-10-08 21:45:19.645382
______________________________________
URL: https://siniestroshava.com.co/hava/css/devextreme/dx.material.hava.css
ORIGIN: None
HOST: siniestroshava.com.co
SEC-FETCH-DEST: style
TIMESTAMP: 2022-10-08 21:45:19.646197
______________________________________
[...]
______________________________________
As you can see, the main website is https://fasecolda.com, and the iframe src is https://siniestroshava.com.co/. You can clearly observe all the requests made since loading the original page (I didn't post them all, there are too many), the last request made before interacting with the iframe, the timestamp of interacting with the iframe, and the subsequent requests - the first of which has SEC-FETCH-DEST: iframe, obviously the request made by the iframe due to us clicking the button. Host and origin are also relevant header keys, when they are present.
This is a method to isolate the requests made from the iframe, as opposed to the ones made from the main page.
I believe this should answer your question as asked.
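If you only care about the iframe's own traffic, you can also filter on that header directly. A minimal sketch, reusing the browser object from above and assuming the sec-fetch-dest header is present on the requests you're after (not every request carries it):

# Keep only the requests whose sec-fetch-dest header marks them as iframe navigations
iframe_requests = [
    r for r in browser.requests
    if r.headers.get('sec-fetch-dest') == 'iframe'
]
for r in iframe_requests:
    print(r.date, r.url)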

How To Scrape Content With Load More Pages Using Selenium Python

I need to scrape the titles of all blog post articles via a Load More button, as set by my desired range for i in range(1,3):
At present I'm only able to capture the titles from the first page, even though I'm able to navigate to the next page using Selenium.
Any help would be much appreciated.
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time

# Selenium Routine
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

# Removes SSL Issues With Chrome
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('log-level=3')
options.add_argument('--disable-notifications')
#options.add_argument('--headless') # Comment to view browser actions

# Get website url
urls = "https://jooble.org/blog/"
r = requests.get(urls)

driver = webdriver.Chrome(executable_path="C:\webdrivers\chromedriver.exe", options=options)
driver.get(urls)

productlist = []

for i in range(1, 3):
    # Get Page Information
    soup = BeautifulSoup(r.content, features='lxml')
    items = soup.find_all('div', class_='post')
    print(f'LOOP: start [{len(items)}]')

    for single_item in items:
        title = single_item.find('div', class_='front__news-title').text.strip()
        print('Title:', title)

        product = {
            'Title': title,
        }
        productlist.append(product)
    print()
    time.sleep(5)
    WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, "//button[normalize-space()='Show more']"))).send_keys(Keys.ENTER)

driver.close()

# Save Results
df = pd.DataFrame(productlist)
df.to_csv('Results.csv', index=False)
You do not need the Selenium overhead in this case, because you can use requests directly to get the question-specific data via the API.
Check the network tab in your browser's DevTools when you click the button, and you will get the URL that is requested to load more content. Iterate over it, setting the parameter value &page={i}.
Example
import requests
import pandas as pd
from bs4 import BeautifulSoup

data = []

for i in range(1, 3):
    url = f'https://jooble.org/blog/wp-admin/admin-ajax.php?id=&post_id=0&slug=home&canonical_url=https%3A%2F%2Fjooble.org%2Fblog%2F&posts_per_page=6&page={i}&offset=20&post_type=post&repeater=default&seo_start_page=1&preloaded=false&preloaded_amount=0&lang=en&order=DESC&orderby=date&action=alm_get_posts&query_type=standard'
    r = requests.get(url)

    if r.status_code != 200:
        print(f'Error occurred: {r.status_code} on url: {url}')
    else:
        soup = BeautifulSoup(str(r.json()['html']))
        for e in soup.select('.type-post'):
            data.append({
                'title': e.select_one('.front__news-title').get_text(strip=True),
                'description': e.select_one('.front__news-description').get_text(strip=True),
                'url': e.a.get('href')
            })

pd.DataFrame(data)
Output (each row shows index, title, description, url):

0  How To Become A Copywriter
   If you have a flair for writing, you might consider leveraging your talents to earn some dough by working as a copywriter. The primary aim of a copywriter is to…
   https://jooble.org/blog/how-to-become-a-copywriter/
1  How to Find a Job in 48 Hours
   A job search might sound scary for many people. However, it doesn't have to be challenging, long, or overwhelming. With Jooble, it is possible to find the best employment opportunities…
   https://jooble.org/blog/how-to-find-a-job-in-48-hours/
2  17 Popular Jobs That Involve Working With Animals
   If you are interested in caring for or helping animals, you can build a successful career in this field. The main thing is to find the right way. Working with…
   https://jooble.org/blog/17-popular-jobs-that-involve-working-with-animals/
3  How to Identify Phishing and Email Scam
   What Phishing and Email Scam Are Cybercrime is prospering, and more and more internet users are afflicted daily. The best example of an online scam is the phishing approach -…
   https://jooble.org/blog/how-to-identify-phishing-and-email-scam/
4  What To Do After Getting Fired
   For many people, thoughts of getting fired tend to be spine-chilling. No wonder, since it means your everyday life gets upside down in minutes. Who would like to go through…
   https://jooble.org/blog/what-to-do-after-getting-fired/
5  A mobile application for a job search in 69 countries has appeared
   Jooble, a job search site in 69 countries, has launched the Jooble Job Search mobile app for iOS and Android. It will help the searcher view vacancies more conveniently and…
   https://jooble.org/blog/a-mobile-application-for-a-job-search-in-69-countries-has-appeared/
...

How to webscrape password protected website

I have a website from which I need to scrape some data (The website is https://www.merriam-webster.com/ and I want to scrape the saved words).
This website is password protected, and I also think there is some javascript stuff going on that I don't understand (I think certain elements are loaded by the browser since they don't show up when I wget the html).
I currently have a solution using Selenium; it does work, but it requires Firefox to be open, and I would really like a solution that I can run as a console-only program in the background.
How would I achieve this, if possible using Python's requests library and the least amount of additional third-party libraries?
Here is the code for my selenium solution:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import json

# Create new driver
browser = webdriver.Firefox()
browser.get('https://www.merriam-webster.com/login')

# Find fields for email and password
username = browser.find_element_by_id("ul-email")
password = browser.find_element_by_id('ul-password')
# Find button to login
send = browser.find_element_by_id('ul-login')

# Send username and password
username.send_keys("username")
password.send_keys("password")

# Wait for accept cookies button to appear and click it
WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, "accept-cookies-button"))).click()
# Click the login button
send.click()

# Find button to go to saved words
WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, "ul-favorites"))).click()

words = {}

# Now logged in
# Loop over pages of saved words
for i in range(2):
    print("Now on page " + str(i+1))

    # Find next page button
    nextpage = browser.find_element_by_class_name("ul-page-next")
    # Wait for the next page button to be clickable
    WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, "ul-page-next")))

    # Find all the words on the page
    for word in browser.find_elements_by_class_name('item-headword'):
        # Add the href to the dictionary
        words[word.get_attribute("innerHTML")] = word.get_attribute("href")

    # Navigate to the next page
    nextpage.click()

browser.close()

# Print the words list
with open("output.json", "w", encoding="utf-8") as file:
    file.write(json.dumps(words, indent=4))
If you want to use the requests module you need to use a session.
To initialise a session you do:
session_requests = requests.session()
Then you need a payload with the username and password
payload = {
    "username": <USERNAME>,
    "password": <PASSWORD>
}
Then to log in you do:
result = session_requests.post(
    login_url,
    data=payload,
    headers=dict(referer=login_url)
)
Now your session should be logged in, so to go to any other password protect page you use the same session:
result = session_requests.get(
    url,
    headers=dict(referer=url)
)
Then you can use result.content to view the content of that page.
EDIT: if your site includes a CSRF token, you will need to include that in the payload. To get the CSRF token, replace the "payload" section with:
from lxml import html

tree = html.fromstring(result.text)
# you may need to manually inspect the tree to find how your CSRF token is specified.
authenticity_token = list(set(tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")))[0]

payload = {
    "username": <USERNAME>,
    "password": <PASSWORD>,
    "csrfmiddlewaretoken": authenticity_token
}
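Putting those pieces together for the Merriam-Webster case from the question, a rough end-to-end sketch might look like the following. The login URL, form field names, and saved-words URL here are assumptions; confirm them by inspecting the real login form and the network traffic in your browser's developer tools:

import requests
from lxml import html

LOGIN_URL = "https://www.merriam-webster.com/login"              # assumption: the form may actually post elsewhere
SAVED_WORDS_URL = "https://www.merriam-webster.com/saved-words"  # hypothetical URL, check it in the browser

session = requests.session()

# Load the login page first so the session picks up cookies and any hidden tokens
result = session.get(LOGIN_URL)
tree = html.fromstring(result.text)

payload = {
    "username": "you@example.com",   # field names are assumptions; inspect the real <form> inputs
    "password": "your-password",
}

# If the form carries a CSRF token, add it to the payload (its name will differ per site)
token = tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")
if token:
    payload["csrfmiddlewaretoken"] = token[0]

session.post(LOGIN_URL, data=payload, headers=dict(referer=LOGIN_URL))

# The session now carries the login cookies, so protected pages can be fetched directly
saved = session.get(SAVED_WORDS_URL, headers=dict(referer=SAVED_WORDS_URL))
print(saved.status_code, len(saved.content))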

Web scrape with multiple inputs and collect Total Margin required

I have a web link as:
url = "zerodha.com/margin-calculator/SPAN"
Here input parameters with sample values for reference mentioned below:
Exchange - NFO
Product - Options
Symbol - DHFL 27-JUN-19
Option Type - Calls
Strike Price - 120
Net Qty appears automatically as 1500; then use the SELL button and click the ADD button.
I want to collect the Total Margin required (in the above case it is Rs 49,308), which appears at the right end.
You can just use requests. If you observe your network traffic, you can see that it is making a POST request with the selected payload. This is how I would do it:
from requests import Session

BASE_URL = 'https://zerodha.com/margin-calculator/SPAN'
payload = {
    'action': 'calculate',
    'exchange[]': 'NFO',
    'product[]': 'FUT',
    'scrip[]': 'DHFL19AUG',
    'option_type[]': 'CE',
    'strike_price[]': 120,
    'qty[]': 4000,
    'trade[]': 'sell'
}

session = Session()
res = session.post(BASE_URL, data=payload)
data = res.json()
print(data)
I got the URL and payload from observing the network tab. This is what you will get as data in JSON form.
(Screenshot: results in Chrome and Python.)
Just observe how Chrome or Firefox sends and receives data, and reverse-engineer it with your requests.
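Since the response structure is easiest to work out by looking at it, here is a small follow-up sketch for inspecting the JSON; which key actually holds the total margin has to be read off this output, so I'm not assuming its name:

import json

# Pretty-print the response so the field holding the total margin is easy to spot
print(json.dumps(data, indent=2))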
The website link renders the requested table data dynamically. You should try the Selenium automation library; it allows you to scrape dynamically rendered (JS or AJAX) page data.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import time
driver = webdriver.Chrome("/usr/bin/chromedriver")
driver.get("https://zerodha.com/margin-calculator/SPAN")
# select exchange option of NFO
exchange = driver.find_element_by_name('exchange[]')
exchange.send_keys("NFO")
# select product option of option
product = driver.find_element_by_name('product[]')
product.send_keys("OPT")
# select symbol by option value
symbol = Select(driver.find_element_by_name("scrip[]"))
symbol.select_by_value("DHFL19JUN")
# select option Type CALL option
optionType = driver.find_element_by_name('option_type[]')
optionType.send_keys("CE")
#add Strike price
strikePrice = driver.find_element_by_name('strike_price[]')
strikePrice.clear()
strikePrice.send_keys("120")
# add Net quantity
netQty = driver.find_element_by_name('qty[]')
netQty.clear()
netQty.send_keys("1500")
# select sell radio button
driver.find_elements_by_css_selector("input[name='trade[]'][value='sell']")[0].click()
#submit form
submit = driver.find_element_by_css_selector("input[type='submit'][value='Add']")
submit.click()
time.sleep(2)
# scrape margin
margin = driver.find_element_by_css_selector(".val.total")
print(margin.text)
where '/usr/bin/chromedriver' is the Selenium web driver path.
Download selenium web driver for chrome browser:
http://chromedriver.chromium.org/downloads
Install web driver for chrome browser:
https://christopher.su/2015/selenium-chromedriver-ubuntu/
Selenium tutorial:
https://selenium-python.readthedocs.io/

Submit form that renders dynamically with Scrapy?

I'm trying to submit a dynamically generated user login form using Scrapy and then parse the HTML on the page that corresponds to a successful login.
I was wondering how I could do that with Scrapy or a combination of Scrapy and Selenium. Selenium makes it possible to find the element on the DOM, but I was wondering if it would be possible to "give control back" to Scrapy after getting the full HTML in order to allow it to carry out the form submission and save the necessary cookies, session data etc. in order to scrape the page.
Basically, the only reason I thought Selenium was necessary was because I needed the page to render from the Javascript before Scrapy looks for the <form> element. Are there any alternatives to this, however?
Thank you!
Edit: This question is similar to this one, but unfortunately the accepted answer deals with the Requests library instead of Selenium or Scrapy. Though that scenario may be possible in some cases (watch this to learn more), as alecxe points out, Selenium may be required if "parts of the page [such as forms] are loaded via API calls and inserted into the page with the help of javascript code being executed in the browser".
Scrapy is not actually a great fit for the coursera site since it is extremely asynchronous. Parts of the page are loaded via API calls and inserted into the page with the help of javascript code being executed in the browser. Scrapy is not a browser and cannot handle it.
Which raises the point - why not use the publicly available Coursera API?
Aside from what is documented, there are other endpoints that you can see called in browser developer tools - you need to be authenticated to be able to use them. For example, if you are logged in, you can see the list of courses you've taken:
There is a call to memberships.v1 endpoint.
For the sake of an example, let's start selenium, log in and grab the cookies with get_cookies(). Then, let's yield a Request to memberships.v1 endpoint to get the list of archived courses providing the cookies we've got from selenium:
import json

import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

LOGIN = 'email'
PASSWORD = 'password'


class CourseraSpider(scrapy.Spider):
    name = "courseraSpider"
    allowed_domains = ["coursera.org"]

    def start_requests(self):
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        self.driver.get('https://www.coursera.org/login')

        form = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[@data-js='login-body']//div[@data-js='facebook-button-divider']/following-sibling::form")))
        email = WebDriverWait(form, 10).until(EC.visibility_of_element_located((By.ID, 'user-modal-email')))
        email.send_keys(LOGIN)

        password = form.find_element_by_name('password')
        password.send_keys(PASSWORD)

        login = form.find_element_by_xpath('//button[. = "Log In"]')
        login.click()

        WebDriverWait(self.driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//h2[. = 'My Courses']")))

        self.driver.get('https://www.coursera.org/')
        cookies = self.driver.get_cookies()
        self.driver.close()

        courses_url = 'https://www.coursera.org/api/memberships.v1'
        params = {
            'fields': 'courseId,enrolledTimestamp,grade,id,lastAccessedTimestamp,role,v1SessionId,vc,vcMembershipId,courses.v1(display,partnerIds,photoUrl,specializations,startDate,v1Details),partners.v1(homeLink,name),v1Details.v1(sessionIds),v1Sessions.v1(active,dbEndDate,durationString,hasSigTrack,startDay,startMonth,startYear),specializations.v1(logo,name,partnerIds,shortName)&includes=courseId,vcMembershipId,courses.v1(partnerIds,specializations,v1Details),v1Details.v1(sessionIds),specializations.v1(partnerIds)',
            'q': 'me',
            'showHidden': 'false',
            'filter': 'archived'
        }

        params = '&'.join(key + '=' + value for key, value in params.items())
        yield scrapy.Request(courses_url + '?' + params, cookies=cookies)

    def parse(self, response):
        data = json.loads(response.body)

        for course in data['linked']['courses.v1']:
            print(course['name'])
For me, it prints:
Algorithms, Part I
Computing for Data Analysis
Pattern-Oriented Software Architectures for Concurrent and Networked Software
Computer Networks
Which proves that we can give Scrapy the cookies from selenium and successfully extract the data from the "for logged in users only" pages.
Additionally, make sure you don't violate the rules from the Terms of Use, specifically:
In addition, as a condition of accessing the Sites, you agree not to
... (c) use any high-volume, automated or electronic means to access
the Sites (including without limitation, robots, spiders, scripts or
web-scraping tools);
