Scrape "Button" tag with Selenium - python

import requests
from selenium import webdriver
import bs4
# Location of the ChromeDriver binary. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)

# Origin (pickup) and destination (drop-off) coordinates of the trip.
oLat = 33.8026087
oLong = -84.3369491999999
dLat = 33.79149
dLong = -84.32312

# "%2C" is the URL-encoded comma between latitude and longitude.
url = "https://ride.lyft.com/ridetype?origin=" + str(oLat) + "%2C" + str(oLong) + "&destination=" + str(dLat) + "%2C" + str(dLong) + "&ride_type=&offerProductId=standard"
driver.get(url)

# The page source is read right after get() returns; this script performs no
# explicit wait before taking the snapshot.
content = driver.page_source
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = bs4.BeautifulSoup(content, "html.parser")
print(soup)
print(url)
Here is my code currently. I am trying to scrape the lyft price estimate.
The data is in the "button" tag. This does not show up in the html from the code I provided above. How can I get this data to show up?
import requests
from selenium import webdriver
import bs4
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Location of the ChromeDriver binary handed to Selenium.
# NOTE(review): the backslashes are kept only because "\P" and "\c" are not
# recognised escape sequences; a raw string (r'...') would be more robust.
PATH = 'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)
# Origin (pickup) and destination (drop-off) coordinates of the trip.
oLat = 33.7885662
oLong = -84.326684
dLat = 33.4486296
dLong = -84.4550443
# "%2C" is the URL-encoded comma between latitude and longitude.
url = "https://ride.lyft.com/ridetype?origin=" + str(oLat) + "%2C" + str(oLong) + "&destination=" + str(dLat) + "%2C" + str(dLong) + "&ride_type=&offerProductId=standard"
driver.get(url)
# Wait up to 20 seconds for the element matched by the CSS selector to be visible.
# NOTE(review): in CSS a space is the descendant combinator, so this selector
# asks for an <lctkqn> element inside span.sc-7e9e68d9-0. Matching one span
# carrying both classes would be written "span.sc-7e9e68d9-0.lctkqn".
spanThing = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR , "span.sc-7e9e68d9-0 lctkqn")))
# Prints the object returned by the wait, not its text content.
print(spanThing)
driver.quit()
I tried this additional code, but it doesn't find the span and class for some reason. I'm not sure why

To extract the Page Source you need to induce WebDriverWait for the visibility_of_element_located() of a static element and you can use the following locator strategies:
# Trip endpoints: origin (pickup) and destination (drop-off) coordinates.
oLat = 33.8026087
oLong = -84.3369491999999
dLat = 33.79149
dLong = -84.32312

# Price-estimate URL; "%2C" is the URL-encoded comma between latitude and
# longitude.
url = (
    "https://ride.lyft.com/ridetype"
    f"?origin={oLat}%2C{oLong}"
    f"&destination={dLat}%2C{dLong}"
    "&ride_type=&offerProductId=standard"
)
driver.get(url)

# Block (up to 20 seconds) until the static "Sign up / Log in to request ride"
# label is visible, and only then dump the page source.
login_prompt_visible = EC.visibility_of_element_located(
    (By.XPATH, "//span[contains(., 'Sign up / Log in to request ride')]")
)
WebDriverWait(driver, 20).until(login_prompt_visible)
print(driver.page_source)
driver.quit()
Note : You have to add the following imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Console Output:
<html lang="en-US" class="js-focus-visible" data-js-focus-visible=""><head><meta name="viewport" content="width=device-width"><script type="module">
if (window.performance) {
const toSnake = (str) => str.replace(/([A-Z])/g, function($1) {return '_' + $1.toLowerCase();});
const measure = () => {
const { timing } = window.performance;
if (!timing.navigationStart) return;
const al = [
'event_name','sending_service','connection_end','connection_start','dom_complete',
'dom_content_loaded_event_end','dom_content_loaded_event_start','dom_interactive',
'dom_loading','domain_lookup_end','domain_lookup_start','fetch_start','load_event_end',
'load_event_start','navigation_start','redirect_end','redirect_start','request_start',
'response_end','response_start','secure_connection_start','unload_event_end',
'unload_event_start','connect_start','connect_end','ms_first_paint','source','uri_path',
'request_end','code','track_id','uri_href'
];
const { href = '', pathname = '' } = window.location;
const sE = { event_name: 'navigation_timing_absolute', uri_href: href, uri_path: pathname, sending_service: 'riderweb', source: 'riderweb' };
for (let eN in timing) {
const sEN = toSnake(eN);
if (al.includes(sEN)) { sE[sEN] = timing[eN]; }
}
// iOS 11 supports ES modules, but sendBeacon not available until 11.3.
if (navigator.sendBeacon) {
navigator.sendBeacon('https://www.lyft.com/api/track', JSON.stringify(sE));
}
};
try {
if (document.readyState === 'complete') {
measure();
} else {
window.addEventListener('load', measure);
}
} catch(e) {}
}
</script><script>
var _i18n_extends = Object.assign || function (target) { for (var i = 1; i < arguments.length; i++) { var source = arguments[i]; for (var key in source) { if (Object.prototype.hasOwnProperty.call(source, key)) { target[key] = source[key]; } } } return target; };
;if(!window.__TRANSLATIONS__) window.__TRANSLATIONS__ = {};
window.__TRANSLATIONS__.locale = "en-US";
window.__TRANSLATIONS__.bundleName = "common";
if (!window.__TRANSLATIONS__.data) window.__TRANSLATIONS__.data = {};
_i18n_extends(window.__TRANSLATIONS__.data, {"%;":{"s":"OK"},"#":{"s":"Sorry, we can't find that page"},"$":{"s":"Sorry, there was an error"},"%":{"s":"Back"},"A":{"s":"No tip"},"T":{"s":"Lyft: Request a ride on the web"},"p":{"s":"Current location"},"q":{"s":"You set your pickup as \"Your Location\"{originatingAppMsg}"},"r":{"s":" in Google Maps"},"s":{"s":"To use the same pickup location, Lyft needs access to your current location."},"t":{"s":"Share your location"},"u":{"s":"Location sharing is denied"},"w":{"s":"Submit"},"x":{"s":"Save"},"y":{"s":"Confirm"},"z":{"s":"Unknown error"},"{":{"s":"Close"},"|":{"s":"Cancel"},"}":{"s":"Edit"},"~":{"s":"Delete"},"! ":{"s":"Done"},"!!":{"s":"Log out"},"!#":{"s":"Are you sure you want to log out?"},"!%":{"s":"Payment defaults"},"!&":{"s":"Add a payment method to get started."},"!(":{"s":"Add new card"},"!)":{"s":"Could not update payment method"},"!*":{"s":"Payment"},"!+":{"s":"manage your payment methods"},"!,":{"s":"Payment method"},"!-":{"s":"Card failed!"},"!.":{"s":"Payment method not supported on ride.lyft.com."},"!\u002F":{"s":"Payment method updated across Lyft apps."},"!0":{"s":"You cannot delete your only valid payment method."},"!1":{"s":"Gift cards"},"!2":{"s":"redeem gift cards"},"!3":{"s":"This field is required"},"!4":{"s":"Something went wrong. Please try again."},"!5":{"s":"Click to log out or switch accounts"},"!6":{"s":"Go back"},"!Z":{"s":"Schedule"},"!k":{"s":"schedule a ride"},"(6":{"s":"Ride"},"(7":{"s":"Rent"},"(8":{"s":"Rent a car through Lyft or our partner Sixt"},"(9":{"s":"Help"},"(:":{"s":"Business"},"(;":{"s":"Upcoming rides"},"(\u003C":{"s":"Install on Phone"},"(=":{"s":"Sign up \u002F Log in"},"(\u003E":{"s":"Log in"},"(l":{"s":"Install app"},"(m":{"s":"Free"},")z":{"s":"Not now"},"){":{"s":"Get the Lyft app"},")|":{"s":"More travel options from the palm of your hand"},")}":{"s":"From bikes to rentals and everything in between. 
If it gets you there, it's in the app."},"*\u003E":{"s":"Install on Desktop"},"*?":{"s":"Install on Desktop. It's free and takes up no space on your device"},"*C":{"s":"Text me a link"},"*D":{"s":"We'll send you a text with a link to download the app."},"*E":{"s":"Enter mobile phone number"},"*F":{"s":"Phone invalid"},"*G":{"s":"Refresh"},"*H":{"s":"An update is available"},",+":{"s":"View profile"},",,":{"s":"Get a ride"},",-":{"s":"Rides"},",.":{"s":"Gift cards"},",\u002F":{"s":"Promos"},",0":{"s":"Donate"},",1":{"s":"Invite friends"},",2":{"s":"Help"},",3":{"s":"Settings"},",4":{"s":"Safety Tools"},",5":{"s":"Lyft Rentals"},")d":{"s":"Log in \u002F Sign up"},")e":{"s":"You will need to log in to {action}!"},")f":{"s":"Log in"},")g":{"s":"Cancel"},"a":{"s":"Lyft and OpenStreetMap watermark"},"#L":{"s":"add promotions"},"&^":{"s":"Just now"},"&`.zero":{"s":"{minutes} minutes ago"},"&_.one":{"s":"{minutes} minute ago"},"&`.two":{"s":"{minutes} minutes ago"},"&`.few":{"s":"{minutes} minutes ago"},"&`.many":{"s":"{minutes} minutes ago"},"&`.other":{"s":"{minutes} minutes ago"},"&b.zero":{"s":"{hours} hours ago"},"&a.one":{"s":"{hours} hour ago"},"&b.two":{"s":"{hours} hours ago"},"&b.few":{"s":"{hours} hours ago"},"&b.many":{"s":"{hours} hours ago"},"&b.other":{"s":"{hours} hours ago"},"&d.zero":{"s":"{days} days ago"},"&c.one":{"s":"{days} day ago"},"&d.two":{"s":"{days} days ago"},"&d.few":{"s":"{days} days ago"},"&d.many":{"s":"{days} days ago"},"&d.other":{"s":"{days} days ago"},"&e":{"s":"Less than a minute"},"&g.zero":{"s":"{minutes} Total minutes"},"&f.one":{"s":"{minutes} Total minute"},"&g.two":{"s":"{minutes} Total minutes"},"&g.few":{"s":"{minutes} Total minutes"},"&g.many":{"s":"{minutes} Total minutes"},"&g.other":{"s":"{minutes} Total minutes"},"&i.zero":{"s":"{hours} Total hours"},"&h.one":{"s":"{hours} Total hour"},"&i.two":{"s":"{hours} Total hours"},"&i.few":{"s":"{hours} Total hours"},"&i.many":{"s":"{hours} Total hours"},"&i.other":{"s":"{hours} 
Total hours"},"&k.zero":{"s":"{days} Total days"},"&j.one":{"s":"{days} Total day"},"&k.two":{"s":"{days} Total days"},"&k.few":{"s":"{days} Total days"},"&k.many":{"s":"{days} Total days"},"&k.other":{"s":"{days} Total days"},"(a":{"s":"Any fare exceeding your Lyft Cash balance will be charged to your default payment method."},"(|":{"s":"Total"},"(}":{"s":"You'll pay this price unless you add a stop, change your destination, or if credit expires."},"(~":{"s":"This is an estimated range for your trip."},") ":{"s":"\u003CLink\u003ELog in\u003C\u002FLink\u003E or sign up to lock in your price and request a ride."},")?":{"s":"Driver Name:"},")#":{"s":"Driver's car image"},")A":{"s":"License Plate Number:"},")B":{"s":"Pick up"},")C":{"s":"Picked up"},")D":{"s":"Drop-off"},")E":{"s":"Dropped off"},")F":{"s":"Current location"},")c":{"s":"Close banner"},")y":{"s":"Riders"},"*I":{"s":"Add card"},"*J":{"s":"Edit {cardLabel}"},"+2":{"s":"$10"},"+3":{"s":"$8"},"+4":{"s":"$10"},"+5":{"s":"Unlimited 180-min classic rides for 24 hours"},"+6":{"s":"$15"},"+7":{"s":"Unlimited 30-min classic rides for 24 hours"},"+8":{"s":"Your payment info will be stored securely."},",#":{"s":"Please follow \u003CSupportLink\u003Ethese instructions\u003C\u002FSupportLink\u003E to allow this site to show notifications."},",$":{"s":"Notifications are blocked"},",\u003C":{"s":"Session expired"},",=":{"s":"You have been logged out. Please log back in to continue."},"!u":{"s":"Click to edit your pickup location"},"%\u002F":{"s":"You must \u003CLink\u003Elog in\u003C\u002FLink\u003E to {action}."},"&J":{"s":"Something went wrong. Unable to load your referral history. 
Please try again."},"7f523512b795a02fd9b9b05a1e22ff9b":{"s":"Card number"},"3effb3a930ea2ce61705bffc624e19b6":{"s":"Expiration"},"755c8f863223ae3f7ac0ac1cfe8b3072":{"s":"Name on card"},"22b715147b81b76566fa183406659069":{"s":"Country"},"4b3d5e03b24b6bbc630d15ad2251755f":{"s":"Billing address"},"e0a8872668d31bb76156a8d80a5d7a6c":{"s":"City"},"f420cf2cf310bbff1ead064745e66ec1":{"s":"State"},"8e9d206ff46216065a42a3953a63bd9f":{"s":"Province \u002F Territory"},"9dca7ddd59d7aca64aae58c7a99e16ce":{"s":"State \u002F Province"},"50be4be10369e747d757e7b2db2c9ed3":{"s":"Zip code"},"11ceb56a912fd18cc9ea1054c5405c13":{"s":"Postal code"},"5a0a89ab4fd1ceebfd9f68b88d27e685":{"s":"Save"},"45c9b92858c6ce6b50c1967661063ae8":{"s":"Cancel"},"29fc403cabcebe790ddd09c592f7e7cd":{"s":"There was a problem reading your card details. Please try again."},"1ae24aeff3771f629b2f865074b68050":{"s":"You must be logged in to add a payment method."},"275c89584bcddfbf0019d8d5a2ce6128":{"s":"You must be logged in to edit a payment method."},"2a420e791e0ec6d47cb64d5fab8376a9":{"s":"Please fill out all required fields"},"a966a08942254351695c6993e781301e":{"s":"Something went wrong. Please check your information and try again"}});
</script><meta charset="utf-8"><meta content="IE=Edge" http-equiv="X-UA-Compatible"><meta name="google" content="notranslate"><meta http-equiv="Accept-CH" content="DPR, Viewport-Width, Width, Downlink, Save-Data, Content-DPR"><link rel="home" href="https://ride.lyft.com"><link rel="canonical" href="https://ride.lyft.com"><link rel="icon" href="https://cdn.lyft.com/static/www-meta-assets/favicon.ico"><link rel="shortcut icon" sizes="192x192" href="https://cdn.lyft.com/static/riderweb/images/icons/icon-192x192.png"><link rel="apple-touch-startup-image" href="https://cdn.lyft.com/static/riderweb/images/icons/icon-192x192.png"><link rel="apple-touch-icon" href="https://cdn.lyft.com/static/riderweb/images/icons/icon-192x192.png"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="black-translucent"><meta property="og:title" content="Lyft: Request a ride on the web"><meta property="og:url" content="https://ride.lyft.com"><meta name="twitter:card" content="summary_large_image"><meta name="twitter:site" content="#lyft"><meta name="msapplication-starturl" content="https://ride.lyft.com"><link rel="stylesheet" href="https://cdn.lyft.com/coreui/base.4.6.5.css"><meta name="google-site-verification" content="V9fk-oLTj9Ewu7Kc6Vetf94qp8HZ3gfjxFMkn8LmZ3Y"><link rel="manifest" href="/manifest.json" crossorigin="use-credentials"><meta name="theme-color" content="#FFFFFF"><meta name="description" content="Request a Lyft ride in a web browser on your phone, tablet, or laptop – no app download required. Get a ride from a friendly driver in minutes."><meta property="og:description" content="Request a Lyft ride in a web browser on your phone, tablet, or laptop – no app download required. Get a ride from a friendly driver in minutes."><meta property="og:image" content="/images/share.png">
.
,
<next-route-announcer><p aria-live="assertive" id="__next-route-announcer__" role="alert" style="border: 0px; clip: rect(0px, 0px, 0px, 0px); height: 1px; margin: -1px; overflow: hidden; padding: 0px; position: absolute; width: 1px; white-space: nowrap; overflow-wrap: normal;"></p></next-route-announcer><iframe name="__privateStripeMetricsController9540" frameborder="0" allowtransparency="true" scrolling="no" role="presentation" allow="payment *" src="https://js.stripe.com/v3/m-outer-93afeeb17bc37e711759584dbfc50d47.html#url=https%3A%2F%2Fride.lyft.com%2Fridetype%3Forigin%3D33.8026087%252C-84.3369491999999%26destination%3D33.79149%252C-84.32312%26ride_type%3D%26offerProductId%3Dstandard&title=Lyft%3A%20Price%20estimate&referrer=&muid=NA&sid=NA&version=6&preview=false" aria-hidden="true" tabindex="-1" style="border: none !important; margin: 0px !important; padding: 0px !important; width: 1px !important; min-width: 100% !important; overflow: hidden !important; display: block !important; visibility: hidden !important; position: fixed !important; height: 1px !important; pointer-events: none !important; user-select: none !important;"></iframe></body></html>

Related

Handling Date picker using Selenium

<input placeholder="MM/DD/YYYY" autocomplete="on" type="text" class="form-control" value="01/01/2020" style="height: 40px; color: (25, 25, 25); font-weight: bold; font-size: 14px; background: >
error: Message: element not interactable
Can you check this
# Locate the date input by XPath. To pick one of several matching inputs,
# append an index such as [1] or [2] after the closing parenthesis.
# The locator is wrapped in double quotes so the single-quoted attribute value
# inside it does not terminate the Python string.
date_input = driver.find_element_by_xpath("((//input[@type='text']))")
date_input.click()
# Select the existing value (Ctrl+A) and delete it before typing the new date.
date_input.send_keys(Keys.CONTROL, "a")
date_input.send_keys(Keys.BACKSPACE)
# Type the new date and confirm it with Enter.
date_input.send_keys("02/14/2020", Keys.RETURN)
This is my Python script for a date picker. I hope it is useful in some way.
from selenium import webdriver
# Set the chromedriver.exe path
driver = webdriver.Chrome(executable_path="C:\\chromedriver.exe")
driver.implicitly_wait(0.5)
# Launch the URL
driver.get("https://jqueryui.com/datepicker/")
# Switch into the iframe that is looked up by its "demo-frame" class
demo_frame = driver.find_element_by_xpath("//iframe[@class='demo-frame']")
driver.switch_to.frame(demo_frame)
# Identify the date input inside the frame and click it
date_field = driver.find_element_by_id("datepicker")
date_field.click()
# Collect every table cell found after the click
day_cells = driver.find_elements_by_xpath("//table/tbody/tr/td")
# Click the first cell whose text is the required day, then stop searching
for cell in day_cells:
    if cell.text == '3':
        cell.click()
        break
# Read back the value now held by the input
selected_date = date_field.get_attribute('value')
print("Date entered is: ")
print(selected_date)
# Quit the browser
driver.quit()

Print out selenium text variable

I have a function which extracts data from a Twitter page; however, when the script completes I receive no output. The function is meant to output various pieces of information from a tweet. I'm just trying to print out the second tweet on the page.
card definition
Function
def get_tweet_data(card):
    """Extract the data of a single tweet from its card element.

    Args:
        card: Element representing one tweet card; every XPath below is
            evaluated relative to it.

    Returns:
        A tuple ``(username, handle, postdate, text, reply_cnt, retweet_cnt,
        like_cnt)``, or ``None`` when the card contains no ``<time>`` element.
    """
    username = card.find_element_by_xpath(".//span").text
    # The handle is the first span whose text contains "@".
    handle = card.find_element_by_xpath('.//span[contains(text(), "@" )]').text
    try:
        postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')
    except NoSuchElementException:
        # No timestamp on this card: give up on it.
        return
    comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text
    # NOTE(review): this is the same XPath as `comment`, so the text below is
    # the comment twice - confirm whether div[2] was intended here.
    responding = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text
    text = comment + responding  # add both text fields together
    reply_cnt = card.find_element_by_xpath('.//div[@data-testid="reply"]').text
    retweet_cnt = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text
    like_cnt = card.find_element_by_xpath('.//div[@data-testid="like"]').text
    tweet = (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt)
    return tweet
Command line arguments
python twitter.py get_tweet_data(1)
So, this one took a while; but, I was able to get the information for you. When I went through Twitter's HTML, 6 different xpath calls were needed
# Count of number of Tweets
(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]//article[@role='article']//div[@data-testId='tweet'])
# First Card
(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]//article[@role='article']//div[@data-testId='tweet'])[1]
# Twitter Card Likes, Retweets, Replies
(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]//article[@role='article']//div[@data-testId='tweet'])[1]//div[contains(@aria-label, 'likes')]
# Twitter's Text Content
(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]//article[@role='article']//div[@data-testId='tweet'])[1]//div[@lang]
# Twitter's DateTime
(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]//article[@role='article']//div[@data-testId='tweet'])[1]//time[@datetime]
# Twitter href is the Twitter Account Poster
((//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]//article[@role='article']//div[@data-testId='tweet'])[1]//a[@role='link'])[1]
Once I determined the proper xpath calls, I, then, created a class to store my data
class Twitter_Info:
    """This class contains the information regarding to the Twitter Card"""

    # Declared fields only: no defaults are set here, so every field must be
    # assigned on the instance before print_info() is called.
    CardNumber : int
    Likes : int
    Retweets : int
    Replies : int
    ContentInfo : str
    PostDate : str
    PosterAccount : str

    def print_info(self):
        """Print every field of the card, one labelled line per field."""
        print(f'Card Number: {self.CardNumber}')
        print(f'Poster Account: {self.PosterAccount}')
        print(f'Tweet Date: {self.PostDate}')
        print(f'Likes: {self.Likes}')
        print(f'Replies: {self.Replies}')
        print(f'Retweets: {self.Retweets}')
        print(f'Tweet Content: {self.ContentInfo}')
Once this was accomplished, I added different methods to help accomplish the task at hand
wait_for_tweets_to_load
number_of_tweets_displayed
scroll_to_card
get_card_likes_retweets_replies
get_card_text_content
get_card_datetime
get_card_poster_info
Once these were determined, I was able to scroll to each card and scrape the data
MAIN PROGRAM - For Reference
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as DriverWait
from selenium.webdriver.support import expected_conditions as DriverConditions
from selenium.common.exceptions import WebDriverException
import time
class Twitter_Info:
    """This class contains the information regarding to the Twitter Card"""

    # Declared fields only: no defaults are set here, so every field must be
    # assigned on the instance before print_info() is called.
    CardNumber : int
    Likes : int
    Retweets : int
    Replies : int
    ContentInfo : str
    PostDate : str
    PosterAccount : str

    def print_info(self):
        """Print every field of the card, one labelled line per field."""
        print(f'Card Number: {self.CardNumber}')
        print(f'Poster Account: {self.PosterAccount}')
        print(f'Tweet Date: {self.PostDate}')
        print(f'Likes: {self.Likes}')
        print(f'Replies: {self.Replies}')
        print(f'Retweets: {self.Retweets}')
        print(f'Tweet Content: {self.ContentInfo}')
def get_chrome_driver():
    """This sets up our Chrome Driver and returns it as an object"""
    # Raw string: the Windows backslashes stay literal instead of relying on
    # "\S", "\W" and "\c" being unrecognised escape sequences.
    path_to_chrome = r"F:\Selenium_Drivers\Windows_Chrome85_Driver\chromedriver.exe"
    chrome_options = webdriver.ChromeOptions()
    # Browser is displayed in a custom window size
    chrome_options.add_argument("window-size=1500,1000")
    return webdriver.Chrome(executable_path=path_to_chrome,
                            options=chrome_options)
def wait_displayed(driver : ChromeDriver, xpath: str, int = 5):
    """Wait until an element matching ``xpath`` is present, or raise.

    Args:
        driver: The browser driver to wait on.
        xpath: XPath locator of the element to wait for.
        int: Timeout passed to the wait (the parameter name shadows the
            builtin; it is kept so existing keyword callers keep working).

    Raises:
        WebDriverException: If the wait fails for any reason; the original
            error is chained as the cause.
    """
    try:
        # The condition used is presence of the element, despite the
        # function's name.
        DriverWait(driver, int).until(
            DriverConditions.presence_of_element_located(locator = (By.XPATH, xpath))
        )
    except Exception as error:
        # `Exception` rather than a bare except, so Ctrl-C and interpreter
        # shutdown are not converted into a "Timeout" error.
        raise WebDriverException(f'Timeout: Failed to find {xpath}') from error
def is_displayed(driver : ChromeDriver, xpath: str, int = 5):
    """Report whether an element matching ``xpath`` shows up within the timeout.

    Args:
        driver: The browser driver to wait on.
        xpath: XPath locator of the element to look for.
        int: Timeout passed to the wait (the parameter name shadows the
            builtin; it is kept so existing keyword callers keep working).

    Returns:
        True when the wait yields an element, False when the wait fails.
    """
    try:
        # The condition used is presence of the element, despite the
        # function's name.
        webElement = DriverWait(driver, int).until(
            DriverConditions.presence_of_element_located(locator = (By.XPATH, xpath))
        )
    except Exception:
        # Any failure of the wait counts as "not displayed". `Exception`
        # rather than a bare except, so Ctrl-C still interrupts the program.
        return False
    return webElement is not None
def scroll_to_element(driver : ChromeDriver, xpath: str, int = 5):
    """Wait for the element matching ``xpath`` and scroll it into view.

    Args:
        driver: The browser driver to act on.
        xpath: XPath locator of the element to scroll to.
        int: Timeout passed to the wait (the parameter name shadows the
            builtin; it is kept so existing keyword callers keep working).

    Raises:
        WebDriverException: If the wait or the scroll script fails; the
            original error is chained as the cause.
    """
    try:
        webElement = DriverWait(driver, int).until(
            DriverConditions.presence_of_element_located(locator = (By.XPATH, xpath))
        )
        driver.execute_script("arguments[0].scrollIntoView();", webElement)
    except Exception as error:
        # `Exception` rather than a bare except, so Ctrl-C and interpreter
        # shutdown are not converted into a "Timeout" error.
        raise WebDriverException(f'Timeout: Failed to find {xpath}\nResult: Failed to Scroll') from error
def wait_for_tweets_to_load(driver : ChromeDriver):
    """Wait until the "Loading Tweets" indicator is no longer found.

    Returns immediately when the indicator is not found in the first place.
    Otherwise the indicator is re-checked up to 10 times, sleeping 3 seconds
    between checks.

    Raises:
        Exception: If the indicator is still found on the 10th check.
    """
    loading_xpath = "//main[@role='main']//div[@data-testid='primaryColumn']//div[contains(@aria-label, 'Loading Tweets')]"
    if not is_displayed(driver, loading_xpath):
        return
    for attempt in range(10):
        # One lookup per iteration: each is_displayed() call may itself wait
        # for its own timeout, so the result is reused for both decisions.
        if not is_displayed(driver, loading_xpath):
            return
        if attempt == 9:
            raise Exception("Page Failed To Load Tweets")
        time.sleep(3)
def number_of_tweets_displayed(driver : ChromeDriver):
    """Return how many tweet cards currently match the timeline XPath.

    Note: This number will change dynamically when we scroll down on the page ( new Tweets will start loading )
    """
    xpath = (
        "(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']"
        "//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]"
        "//article[@role='article']//div[@data-testId='tweet'])"
    )
    return len(driver.find_elements(By.XPATH, xpath))
def scroll_to_card(driver : ChromeDriver, card_number : int):
    """Scroll the tweet card at position ``card_number`` into view.

    Args:
        driver: The browser driver to act on.
        card_number: Index appended to the card XPath as ``[card_number]``
            (XPath positions start at 1).
    """
    xpath = (
        "(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']"
        "//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]"
        "//article[@role='article']//div[@data-testId='tweet'])"
    )
    scroll_to_element(driver, xpath = f'{xpath}[{card_number}]')
def get_card_likes_retweets_replies(driver : ChromeDriver, card_number : int):
    """Return the comma-separated parts of a card's engagement ``aria-label``.

    Args:
        driver: The browser driver to query.
        card_number: Index appended to the card XPath as ``[card_number]``
            (XPath positions start at 1).

    Returns:
        The ``aria-label`` of the card's div whose label contains "likes",
        split on ",".
    """
    xpath = (
        "(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']"
        "//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]"
        "//article[@role='article']//div[@data-testId='tweet'])"
    )
    xpath = f'{xpath}[{card_number}]//div[contains(@aria-label, "likes")]'
    return driver.find_element(By.XPATH, xpath).get_attribute('aria-label').split(',')
def get_card_text_content(driver : ChromeDriver, card_number : int):
    """Return the text of the card's element that carries a ``lang`` attribute.

    Args:
        driver: The browser driver to query.
        card_number: Index appended to the card XPath as ``[card_number]``
            (XPath positions start at 1).
    """
    xpath = (
        "(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']"
        "//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]"
        "//article[@role='article']//div[@data-testId='tweet'])"
    )
    xpath = f'{xpath}[{card_number}]//div[@lang]'
    return driver.find_element(By.XPATH, xpath).text
def get_card_datetime(driver : ChromeDriver, card_number : int):
    """Return the ``datetime`` attribute of the card's ``<time>`` element.

    Args:
        driver: The browser driver to query.
        card_number: Index appended to the card XPath as ``[card_number]``
            (XPath positions start at 1).
    """
    xpath = (
        "(//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']"
        "//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]"
        "//article[@role='article']//div[@data-testId='tweet'])"
    )
    xpath = f'{xpath}[{card_number}]//time[@datetime]'
    return driver.find_element(By.XPATH, xpath).get_attribute('datetime')
def get_card_poster_info(driver : ChromeDriver, card_number : int):
    """Return the ``href`` of the first ``role="link"`` anchor inside the card.

    Args:
        driver: The browser driver to query.
        card_number: Index appended to the card XPath as ``[card_number]``
            (XPath positions start at 1).
    """
    # Two opening parentheses: the inner group selects the cards, the outer
    # group (closed below) lets "[1]" pick the first link of the chosen card.
    xpath = (
        "((//main[@role='main']//div[@data-testid='primaryColumn']//section[@aria-labelledby='accessible-list-0']"
        "//div[contains(@aria-label, 'Timeline:')]//div[contains(@style, 'position: absolute; width: 100%;')]//article[@role='article']"
        "//div[@data-testId='tweet'])"
    )
    xpath = f'{xpath}[{card_number}]//a[@role="link"])[1]'
    return driver.find_element(By.XPATH, xpath).get_attribute('href')
# Gets our chrome driver and opens our site
chrome_driver = get_chrome_driver()
try:
    chrome_driver.get("https://twitter.com/bbc")
    # Wait for three page landmarks, then for the tweets themselves
    wait_displayed(chrome_driver, "//div[@data-testid='placementTracking']//div[@role='button']//span[text()='Follow']")
    wait_displayed(chrome_driver, "//section[@aria-label='Sign up']")
    wait_displayed(chrome_driver, "//aside[@aria-label='Who to follow']")
    wait_for_tweets_to_load(chrome_driver)
    # Get number of Tweets that are displayed
    numberOfTweetsDisplayed = number_of_tweets_displayed(chrome_driver)
    twitter_cards = []
    # Scrape Card Information (XPath positions start at 1)
    for card_number in range(1, numberOfTweetsDisplayed + 1):
        scroll_to_card(chrome_driver, card_number)
        twitter_card = Twitter_Info()
        twitter_card.CardNumber = card_number
        # Get the Like | Retweet | Replies Info
        # Each part is trimmed and cut at its first space; the indexing below
        # takes the parts in the order replies, retweets, likes.
        raw_info = get_card_likes_retweets_replies(chrome_driver, card_number)
        twitter_card.Replies = raw_info[0].strip().split(' ')[0]
        twitter_card.Retweets = raw_info[1].strip().split(' ')[0]
        twitter_card.Likes = raw_info[2].strip().split(' ')[0]
        # Get rest of our data
        twitter_card.ContentInfo = get_card_text_content(chrome_driver, card_number)
        twitter_card.PostDate = get_card_datetime(chrome_driver, card_number)
        twitter_card.PosterAccount = get_card_poster_info(chrome_driver, card_number)
        # Display our information and add it to our list
        twitter_card.print_info()
        twitter_cards.append(twitter_card)
        print(f'Added Card Number {card_number} successfully')
        print('========================================================\n')
    # Print how many twitter cards were scraped
    print(f'Twitter Cards Added: {len(twitter_cards)}')
finally:
    # Always shut the browser down, even when a wait or a lookup above raised
    chrome_driver.quit()
    chrome_driver.service.stop()
SAMPLE OUTPUT
Card Number: 1
Poster Account: https://twitter.com/BBC
Tweet Date: 2020-06-22T11:22:53.000Z
Likes: 1106
Replies: 2827
Retweets: 841
Tweet Content: We’ve always been here to celebrate diversity. But we need to do more, and we will.
This is our commitment to long-term change. #RightTheScript
Read more about our £100m commitment here: https://bbc.in/37OPMLv
Added Card Number 1 successfully
========================================================
Card Number: 2
Poster Account: https://twitter.com/BBC
Tweet Date: 2020-11-16T17:01:00.000Z
Likes: 100
Replies: 10
Retweets: 36
Tweet Content: More than 100 intact sarcophagi, dating back 2,500 years, have been unearthed near Cairo.
Added Card Number 2 successfully
========================================================
Card Number: 3
Poster Account: https://twitter.com/BBC
Tweet Date: 2020-11-15T16:01:00.000Z
Likes: 68
Replies: 5
Retweets: 16
Tweet Content: With Cornish wildlife facing so many threats from humans, these residents do whatever they can to help
#Cornwall with
#simon_reeve
| 8:10pm |
#bbctwo
&
#bbciplayer
.
Added Card Number 3 successfully
========================================================
Card Number: 4
Poster Account: https://twitter.com/bbcasiannetwork
Tweet Date: 2020-11-14T09:44:41.000Z
Likes: 133
Replies: 7
Retweets: 33
Tweet Content: Happy Diwali and Bandi Chhor Divas!
Added Card Number 4 successfully
========================================================
Card Number: 5
Poster Account: https://twitter.com/BBC
Tweet Date: 2020-11-13T22:18:26.000Z
Likes: 443
Replies: 13
Retweets: 86
Tweet Content: It's the clash of the tennis titans
#Andy_Murray
and... er,
#petercrouch
?
#ChildrenInNeed
Added Card Number 5 successfully
========================================================
Card Number: 6
Poster Account: https://twitter.com/BBC
Tweet Date: 2020-11-13T20:57:23.000Z
Likes: 426
Replies: 25
Retweets: 109
Tweet Content: The official video for this year's star-studded
#bbccin
single, 'Stop Crying Your Heart Out' is here!
Watch now and don't forget to download the song to support #ChildrenInNeed
https://bbc.in/32I60EZ
Added Card Number 6 successfully
========================================================
Card Number: 7
Poster Account: https://twitter.com/BBC
Tweet Date: 2020-11-13T15:37:06.000Z
Likes: 18
Replies: 7
Retweets: 7
Tweet Content: It's time for #ChildrenInNeed
2020!
Starting RIGHT NOW on
#BBCOne
&
#BBCiPlayer
http://bbc.in/3kuv1cG
Added Card Number 7 successfully
========================================================
Twitter Cards Added: 7

Python sorting html table

I am looping through a list of servers and connecting with OpenSSL, to retrieve the SSL cert, and grabbing the server name, the date the cert expires, and calculating the number of days until cert expires. I am then building an html table with the data. The columns are Host, Hostname, Expiration Date, and Remaining Days. What is the best way to sort the table by the "Remaining Days" column?
# Update the hosts entry
ssl_results[str(ip)][0] = host
ssl_results[str(ip)][1] = server_name
ssl_results[str(ip)][2] = exp_date
ssl_results[str(ip)][3] = days_to_expire
# Loop through the ssl_results entries and generate a email + results file
try:
# variable to hold html for email
SSLCertificates = """<html>
<head>
<style>
table{width: 1024px;}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 5px;
text-align: left;
}
ul:before{
content:attr(data-header);
font-size:120%;
font-weight:bold;
margin-left:-15px;
}
</style>
</head>
<body>
<p><h2>Blah, </h2>
<h3>SSL Expiration Summary:</h3>
<span style="color:red;"><b>Blah Blah Blah.<b></span><br><br>
<table id=\"exp_ssls\"><tr><th>Host</th><th>Hostname</th><th>Expiration Date</th><th>Remaining Days</th></tr>
"""
for entries in ssl_results:
SSLCertificates += "<tr><td>" + str(entries) + "</td><td>" + str(ssl_results[entries][1]) + "</td><td>" + str(
ssl_results[entries][2]) + "</td><td>" + str(ssl_results[entries][3]) + "</td></tr>"
SSLCertificates += """</body>
</html>"""
f = open('SSLCertificates.html', 'w')
f.write(SSLCertificates)
f.close()
filename = 'SSLCertificates.html'
attachment = open(filename, 'rb')
Sort the dict before you build the HTML tags, then iterate through the sorted result and emit each entry as a table row. Use sorted() on the dict's items before you iterate over them.
import operator
# Example mapping whose entries are to be ordered by value.
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
# Materialise the (key, value) pairs, then order them in place by the value
# (element 1 of each pair).
sorted_x = list(x.items())
sorted_x.sort(key=operator.itemgetter(1))
sorted_x will be a list of tuples sorted by the second element in each tuple. dict(sorted_x) == x.

Selenium doesn't find elements

I'm coding in python with selenium webdriver to automate some stuff, but selenium's find_element_* methods don't work.
These are the webpages I am trying:
https://campus.webex.com/mw3300/mywebex/default.do?siteurl=campus&service=10
and
https://webmail.aruba.it/cgi-bin/ajaxmail
and
http://campus.istitutovolta.eu/index.php
I tried all strategies: by_class, by_link, by_name, by_id, etc. Nothing works! I use geckodriver and Firefox. Here is the code:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException

# Open the WebEx page in Firefox and try to fill in the login form fields.
browser = webdriver.Firefox()

try:
    browser.get("https://campus.webex.com/mw3100/mywebex/default.do?siteurl=campus&service=7")
except WebDriverException:
    # Only driver/navigation failures are reported; Ctrl-C still interrupts.
    print("pagina non trovata")

# Username field: report a missing element, otherwise show it and type into it.
try:
    utente = browser.find_element_by_name("userName")
except NoSuchElementException:
    print("elemento non trovato")
else:
    print(utente)
    utente.send_keys('user#gestione.eu')

# Password field: same lookup-and-fill pattern as the username.
try:
    psw = browser.find_element_by_name("PASSWD")
except NoSuchElementException:
    print("elemento non trovato")
else:
    print(psw)
    psw.send_keys('123456')
UPDATE 2 + 3:
I GOT IT TO WORK. CHECK IT OUT:
Variant 1:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Firefox()
delay = 100  # seconds
# browser.get("https://campus.webex.com/mw3100/mywebex/default.do?siteurl=campus")
# Load the login page URL directly instead of the frameset page above.
browser.get("https://campus.webex.com/mw3300/mywebex/login/login.do?siteurl=campus&login_return_url=%2Ftc3300%2Ftrainingcenter%2Fsite%2FinstantSession.do%3Fsiteurl%3Dcampus")
# until() hands back the element once it is present in the DOM.
myelem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, 'mwx-ipt-username')))
# Reuse the element returned by the wait; send_keys() returns None, so its
# result must not be stored as the element reference.
utente = myelem
utente.send_keys('user#gestione.eu')
Variant 2:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Firefox()
delay = 20  # seconds
browser.get("https://campus.webex.com/mw3100/mywebex/default.do?siteurl=campus")
# Descend into the nested frames before looking for the username field:
# first the outer "mainFrame", then the inner "main" frame.
browser.switch_to.frame("mainFrame")
browser.switch_to.frame("main")
# until() hands back the element once it is present in the DOM.
myelem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, 'mwx-ipt-username')))
# Reuse the element returned by the wait; send_keys() returns None, so its
# result must not be stored as the element reference.
utente = myelem
utente.send_keys('user#gestione.eu')
The problem was the frameset in the site. In the HTML that Selenium parsed,
the content of the frames was missing, and consequently the login form was not found. This can be remedied in two ways: a) Variant 1: you open the appropriate frame's URL directly; b) Variant 2: you switch from the original page into the frame.
see: How to navigate a subframe inside a frameset using Selenium WebDriver with Python?
function for switching frames in python, selenium
How to identify and switch to the frame in selenium webdriver when frame does not have id
How to navigate a subframe inside a frameset using Selenium WebDriver?
UPDATE: I will try to find a solution through the buttons the website provides. Give me some time.
I clicked on your link and searched for "userName", but there was no "userName" on the page.
It's the same, if you open a new tab in your normal firefox, and type
"view-source:https://campus.webex.com/mw3300/mywebex/default.do?siteurl=campus" in it.
I tried this:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Firefox()
delay = 30  # seconds

try:
    browser.get("https://campus.webex.com/mw3100/mywebex/default.do?siteurl=campus&service=7")
except WebDriverException:
    # Only driver/navigation failures are reported; Ctrl-C still interrupts.
    print("pagina non trovata")

# Deliberately outside any try block: if the field never appears within
# `delay` seconds, the wait's timeout error propagates and stops the script.
myelem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, 'mwx-ipt-username')))

try:
    utente = browser.find_element_by_id("mwx-ipt-username")
except NoSuchElementException:
    print("elemento non trovato")
It doesn't work.
The HTML I always get:
<!DOCTYPE html>
<html lang="it-IT">
<HEAD>
<meta http-equiv="X-UA-Compatible" content="IE=Edge">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta name='format-detection' content='telephone=no'>
<meta name='slack-app-id' content='A5P5FDK33'>
<meta name="description" content="5"><link rel="shortcut icon" href="/favicont29.ico" type="image/x-icon">
<script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={xpid:"XQUDUldSGwUCXFdWAAgF"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var o=e[n]={exports:{}};t[n][0].call(o.exports,function(e){var o=t[n][1][e];return r(o||e)},o,o.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<n.length;o++)r(n[o]);return r}({1:[function(t,e,n){function r(t){try{s.console&&console.log(t)}catch(e){}}var o,i=t("ee"),a=t(21),s={};try{o=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof console.log&&(s.console=!0,o.indexOf("dev")!==-1&&(s.dev=!0),o.indexOf("nr_dev")!==-1&&(s.nrDev=!0))}catch(c){}s.nrDev&&i.on("internal-error",function(t){r(t.stack)}),s.dev&&i.on("fn-err",function(t,e,n){r(n.stack)}),s.dev&&(r("NR AGENT IN DEVELOPMENT MODE"),r("flags: "+a(s,function(t,e){return t}).join(", ")))},{}],2:[function(t,e,n){function r(t,e,n,r,s){try{l?l-=1:o(s||new UncaughtException(t,e,n),!0)}catch(f){try{i("ierr",[f,c.now(),!0])}catch(d){}}return"function"==typeof u&&u.apply(this,a(arguments))}function UncaughtException(t,e,n){this.message=t||"Uncaught error with no additional information",this.sourceURL=e,this.line=n}function o(t,e){var n=e?null:c.now();i("err",[t,n])}var i=t("handle"),a=t(22),s=t("ee"),c=t("loader"),f=t("gos"),u=window.onerror,d=!1,p="nr#seenError",l=0;c.features.err=!0,t(1),window.onerror=r;try{throw new Error}catch(h){"stack"in h&&(t(13),t(12),"addEventListener"in window&&t(6),c.xhrWrappable&&t(14),d=!0)}s.on("fn-start",function(t,e,n){d&&(l+=1)}),s.on("fn-err",function(t,e,n){d&&!n[p]&&(f(n,p,function(){return!0}),this.thrown=!0,o(n))}),s.on("fn-end",function(){d&&!this.thrown&&l>0&&(l-=1)}),s.on("internal-error",function(t){i("ierr",[t,c.now(),!0])})},{}],3:[function(t,e,n){t("loader").features.ins=!0},{}],4:[function(t,e,n){function r(){M++,N=y.hash,this[u]=g.now()}function o(){M--,y.hash!==N&&i(0,!0);var 
t=g.now();this[h]=~~this[h]+t-this[u],this[d]=t}function i(t,e){E.emit("newURL",[""+y,e])}function a(t,e){t.on(e,function(){this[e]=g.now()})}var s="-start",c="-end",f="-body",u="fn"+s,d="fn"+c,p="cb"+s,l="cb"+c,h="jsTime",m="fetch",v="addEventListener",w=window,y=w.location,g=t("loader");if(w[v]&&g.xhrWrappable){var b=t(10),x=t(11),E=t(8),O=t(6),P=t(13),R=t(7),T=t(14),L=t(9),j=t("ee"),S=j.get("tracer");t(15),g.features.spa=!0;var N,M=0;j.on(u,r),j.on(p,r),j.on(d,o),j.on(l,o),j.buffer([u,d,"xhr-done","xhr-resolved"]),O.buffer([u]),P.buffer(["setTimeout"+c,"clearTimeout"+s,u]),T.buffer([u,"new-xhr","send-xhr"+s]),R.buffer([m+s,m+"-done",m+f+s,m+f+c]),E.buffer(["newURL"]),b.buffer([u]),x.buffer(["propagate",p,l,"executor-err","resolve"+s]),S.buffer([u,"no-"+u]),L.buffer(["new-jsonp","cb-start","jsonp-error","jsonp-end"]),a(T,"send-xhr"+s),a(j,"xhr-resolved"),a(j,"xhr-done"),a(R,m+s),a(R,m+"-done"),a(L,"new-jsonp"),a(L,"jsonp-end"),a(L,"cb-start"),E.on("pushState-end",i),E.on("replaceState-end",i),w[v]("hashchange",i,!0),w[v]("load",i,!0),w[v]("popstate",function(){i(0,M>1)},!0)}},{}],5:[function(t,e,n){function r(t){}if(window.performance&&window.performance.timing&&window.performance.getEntriesByType){var o=t("ee"),i=t("handle"),a=t(13),s=t(12),c="learResourceTimings",f="addEventListener",u="resourcetimingbufferfull",d="bstResource",p="resource",l="-start",h="-end",m="fn"+l,v="fn"+h,w="bstTimer",y="pushState",g=t("loader");g.features.stn=!0,t(8);var b=NREUM.o.EV;o.on(m,function(t,e){var n=t[0];n instanceof b&&(this.bstStart=g.now())}),o.on(v,function(t,e){var n=t[0];n instanceof 
b&&i("bst",[n,e,this.bstStart,g.now()])}),a.on(m,function(t,e,n){this.bstStart=g.now(),this.bstType=n}),a.on(v,function(t,e){i(w,[e,this.bstStart,g.now(),this.bstType])}),s.on(m,function(){this.bstStart=g.now()}),s.on(v,function(t,e){i(w,[e,this.bstStart,g.now(),"requestAnimationFrame"])}),o.on(y+l,function(t){this.time=g.now(),this.startPath=location.pathname+location.hash}),o.on(y+h,function(t){i("bstHist",[location.pathname+location.hash,this.startPath,this.time])}),f in window.performance&&(window.performance["c"+c]?window.performance[f](u,function(t){i(d,[window.performance.getEntriesByType(p)]),window.performance["c"+c]()},!1):window.performance[f]("webkit"+u,function(t){i(d,[window.performance.getEntriesByType(p)]),window.performance["webkitC"+c]()},!1)),document[f]("scroll",r,{passive:!0}),document[f]("keypress",r,!1),document[f]("click",r,!1)}},{}],6:[function(t,e,n){function r(t){for(var e=t;e&&!e.hasOwnProperty(u);)e=Object.getPrototypeOf(e);e&&o(e)}function o(t){s.inPlace(t,[u,d],"-",i)}function i(t,e){return t[1]}var a=t("ee").get("events"),s=t(24)(a,!0),c=t("gos"),f=XMLHttpRequest,u="addEventListener",d="removeEventListener";e.exports=a,"getPrototypeOf"in Object?(r(document),r(window),r(f.prototype)):f.prototype.hasOwnProperty(u)&&(o(window),o(f.prototype)),a.on(u+"-start",function(t,e){var n=t[1],r=c(n,"nr#wrapped",function(){function t(){if("function"==typeof n.handleEvent)return n.handleEvent.apply(n,arguments)}var e={object:t,"function":n}[typeof n];return e?s(e,"fn-",null,e.name||"anonymous"):n});this.wrapped=t[1]=r}),a.on(d+"-start",function(t){t[1]=this.wrapped||t[1]})},{}],7:[function(t,e,n){function r(t,e,n){var r=t[e];"function"==typeof r&&(t[e]=function(){var t=r.apply(this,arguments);return o.emit(n+"start",arguments,t),t.then(function(e){return o.emit(n+"end",[null,e],t),e},function(e){throw o.emit(n+"end",[e],t),e})})}var o=t("ee").get("fetch"),i=t(21);e.exports=o;var 
a=window,s="fetch-",c=s+"body-",f=["arrayBuffer","blob","json","text","formData"],u=a.Request,d=a.Response,p=a.fetch,l="prototype";u&&d&&p&&(i(f,function(t,e){r(u[l],e,c),r(d[l],e,c)}),r(a,"fetch",s),o.on(s+"end",function(t,e){var n=this;if(e){var r=e.headers.get("content-length");null!==r&&(n.rxSize=r),o.emit(s+"done",[null,e],n)}else o.emit(s+"done",[t],n)}))},{}],8:[function(t,e,n){var r=t("ee").get("history"),o=t(24)(r);e.exports=r,o.inPlace(window.history,["pushState","replaceState"],"-")},{}],9:[function(t,e,n){function r(t){function e(){c.emit("jsonp-end",[],p),t.removeEventListener("load",e,!1),t.removeEventListener("error",n,!1)}function n(){c.emit("jsonp-error",[],p),c.emit("jsonp-end",[],p),t.removeEventListener("load",e,!1),t.removeEventListener("error",n,!1)}var r=t&&"string"==typeof t.nodeName&&"script"===t.nodeName.toLowerCase();if(r){var o="function"==typeof t.addEventListener;if(o){var a=i(t.src);if(a){var u=s(a),d="function"==typeof u.parent[u.key];if(d){var p={};f.inPlace(u.parent,[u.key],"cb-",p),t.addEventListener("load",e,!1),t.addEventListener("error",n,!1),c.emit("new-jsonp",[t.src],p)}}}}}function o(){return"addEventListener"in window}function i(t){var e=t.match(u);return e?e[1]:null}function a(t,e){var n=t.match(p),r=n[1],o=n[3];return o?a(o,e[r]):e[r]}function s(t){var e=t.match(d);return e&&e.length>=3?{key:e[2],parent:a(e[1],window)}:{key:t,parent:window}}var c=t("ee").get("jsonp"),f=t(24)(c);if(e.exports=c,o()){var u=/[?&](?:callback|cb)=([^&#]+)/,d=/(.*)\.([^.]+)/,p=/^(\w+)(\.|$)(.*)$/,l=["appendChild","insertBefore","replaceChild"];f.inPlace(HTMLElement.prototype,l,"dom-"),f.inPlace(HTMLHeadElement.prototype,l,"dom-"),f.inPlace(HTMLBodyElement.prototype,l,"dom-"),c.on("dom-start",function(t){r(t[0])})}},{}],10:[function(t,e,n){var r=t("ee").get("mutation"),o=t(24)(r),i=NREUM.o.MO;e.exports=r,i&&(window.MutationObserver=function(t){return this instanceof i?new 
i(o(t,"fn-")):i.apply(this,arguments)},MutationObserver.prototype=i.prototype)},{}],11:[function(t,e,n){function r(t){var e=a.context(),n=s(t,"executor-",e),r=new f(n);return a.context(r).getCtx=function(){return e},a.emit("new-promise",[r,e],e),r}function o(t,e){return e}var i=t(24),a=t("ee").get("promise"),s=i(a),c=t(21),f=NREUM.o.PR;e.exports=a,f&&(window.Promise=r,["all","race"].forEach(function(t){var e=f[t];f[t]=function(n){function r(t){return function(){a.emit("propagate",[null,!o],i),o=o||!t}}var o=!1;c(n,function(e,n){Promise.resolve(n).then(r("all"===t),r(!1))});var i=e.apply(f,arguments),s=f.resolve(i);return s}}),["resolve","reject"].forEach(function(t){var e=f[t];f[t]=function(t){var n=e.apply(f,arguments);return t!==n&&a.emit("propagate",[t,!0],n),n}}),f.prototype["catch"]=function(t){return this.then(null,t)},f.prototype=Object.create(f.prototype,{constructor:{value:r}}),c(Object.getOwnPropertyNames(f),function(t,e){try{r[e]=f[e]}catch(n){}}),a.on("executor-start",function(t){t[0]=s(t[0],"resolve-",this),t[1]=s(t[1],"resolve-",this)}),a.on("executor-err",function(t,e,n){t[1](n)}),s.inPlace(f.prototype,["then"],"then-",o),a.on("then-start",function(t,e){this.promise=e,t[0]=s(t[0],"cb-",this),t[1]=s(t[1],"cb-",this)}),a.on("then-end",function(t,e,n){this.nextPromise=n;var r=this.promise;a.emit("propagate",[r,!0],n)}),a.on("cb-end",function(t,e,n){a.emit("propagate",[n,!0],this.nextPromise)}),a.on("propagate",function(t,e,n){this.getCtx&&!e||(this.getCtx=function(){if(t instanceof Promise)var e=a.context(t);return e&&e.getCtx?e.getCtx():this})}),r.toString=function(){return""+f})},{}],12:[function(t,e,n){var r=t("ee").get("raf"),o=t(24)(r),i="equestAnimationFrame";e.exports=r,o.inPlace(window,["r"+i,"mozR"+i,"webkitR"+i,"msR"+i],"raf-"),r.on("raf-start",function(t){t[0]=o(t[0],"fn-")})},{}],13:[function(t,e,n){function r(t,e,n){t[0]=a(t[0],"fn-",null,n)}function o(t,e,n){this.method=n,this.timerDuration=isNaN(t[1])?0:+t[1],t[0]=a(t[0],"fn-",this,n)}var 
i=t("ee").get("timer"),a=t(24)(i),s="setTimeout",c="setInterval",f="clearTimeout",u="-start",d="-";e.exports=i,a.inPlace(window,[s,"setImmediate"],s+d),a.inPlace(window,[c],c+d),a.inPlace(window,[f,"clearImmediate"],f+d),i.on(c+u,r),i.on(s+u,o)},{}],14:[function(t,e,n){function r(t,e){d.inPlace(e,["onreadystatechange"],"fn-",s)}function o(){var t=this,e=u.context(t);t.readyState>3&&!e.resolved&&(e.resolved=!0,u.emit("xhr-resolved",[],t)),d.inPlace(t,y,"fn-",s)}function i(t){g.push(t),h&&(x?x.then(a):v?v(a):(E=-E,O.data=E))}function a(){for(var t=0;t<g.length;t++)r([],g[t]);g.length&&(g=[])}function s(t,e){return e}function c(t,e){for(var n in t)e[n]=t[n];return e}t(6);var f=t("ee"),u=f.get("xhr"),d=t(24)(u),p=NREUM.o,l=p.XHR,h=p.MO,m=p.PR,v=p.SI,w="readystatechange",y=["onload","onerror","onabort","onloadstart","onloadend","onprogress","ontimeout"],g=[];e.exports=u;var b=window.XMLHttpRequest=function(t){var e=new l(t);try{u.emit("new-xhr",[e],e),e.addEventListener(w,o,!1)}catch(n){try{u.emit("internal-error",[n])}catch(r){}}return e};if(c(l,b),b.prototype=l.prototype,d.inPlace(b.prototype,["open","send"],"-xhr-",s),u.on("send-xhr-start",function(t,e){r(t,e),i(e)}),u.on("open-xhr-start",r),h){var x=m&&m.resolve();if(!v&&!m){var E=1,O=document.createTextNode(E);new h(a).observe(O,{characterData:!0})}}else f.on("fn-end",function(t){t[0]&&t[0].type===w||a()})},{}],15:[function(t,e,n){function r(t){var e=this.params,n=this.metrics;if(!this.ended){this.ended=!0;for(var r=0;r<d;r++)t.removeEventListener(u[r],this.listener,!1);if(!e.aborted){if(n.duration=a.now()-this.startTime,4===t.readyState){e.status=t.status;var i=o(t,this.lastSize);if(i&&(n.rxSize=i),this.sameOrigin){var c=t.getResponseHeader("X-NewRelic-App-Data");c&&(e.cat=c.split(", ").pop())}}else e.status=0;n.cbTime=this.cbTime,f.emit("xhr-done",[t],t),s("xhr",[e,n,this.startTime])}}}function o(t,e){var n=t.responseType;if("json"===n&&null!==e)return e;var 
r="arraybuffer"===n||"blob"===n||"json"===n?t.response:t.responseText;return h(r)}function i(t,e){var n=c(e),r=t.params;r.host=n.hostname+":"+n.port,r.pathname=n.pathname,t.sameOrigin=n.sameOrigin}var a=t("loader");if(a.xhrWrappable){var s=t("handle"),c=t(16),f=t("ee"),u=["load","error","abort","timeout"],d=u.length,p=t("id"),l=t(19),h=t(18),m=window.XMLHttpRequest;a.features.xhr=!0,t(14),f.on("new-xhr",function(t){var e=this;e.totalCbs=0,e.called=0,e.cbTime=0,e.end=r,e.ended=!1,e.xhrGuids={},e.lastSize=null,l&&(l>34||l<10)||window.opera||t.addEventListener("progress",function(t){e.lastSize=t.loaded},!1)}),f.on("open-xhr-start",function(t){this.params={method:t[0]},i(this,t[1]),this.metrics={}}),f.on("open-xhr-end",function(t,e){"loader_config"in NREUM&&"xpid"in NREUM.loader_config&&this.sameOrigin&&e.setRequestHeader("X-NewRelic-ID",NREUM.loader_config.xpid)}),f.on("send-xhr-start",function(t,e){var n=this.metrics,r=t[0],o=this;if(n&&r){var i=h(r);i&&(n.txSize=i)}this.startTime=a.now(),this.listener=function(t){try{"abort"===t.type&&(o.params.aborted=!0),("load"!==t.type||o.called===o.totalCbs&&(o.onloadCalled||"function"!=typeof e.onload))&&o.end(e)}catch(n){try{f.emit("internal-error",[n])}catch(r){}}};for(var s=0;s<d;s++)e.addEventListener(u[s],this.listener,!1)}),f.on("xhr-cb-time",function(t,e,n){this.cbTime+=t,e?this.onloadCalled=!0:this.called+=1,this.called!==this.totalCbs||!this.onloadCalled&&"function"==typeof n.onload||this.end(n)}),f.on("xhr-load-added",function(t,e){var n=""+p(t)+!!e;this.xhrGuids&&!this.xhrGuids[n]&&(this.xhrGuids[n]=!0,this.totalCbs+=1)}),f.on("xhr-load-removed",function(t,e){var n=""+p(t)+!!e;this.xhrGuids&&this.xhrGuids[n]&&(delete this.xhrGuids[n],this.totalCbs-=1)}),f.on("addEventListener-end",function(t,e){e instanceof m&&"load"===t[0]&&f.emit("xhr-load-added",[t[1],t[2]],e)}),f.on("removeEventListener-end",function(t,e){e instanceof m&&"load"===t[0]&&f.emit("xhr-load-removed",[t[1],t[2]],e)}),f.on("fn-start",function(t,e,n){e 
instanceof m&&("onload"===n&&(this.onload=!0),("load"===(t[0]&&t[0].type)||this.onload)&&(this.xhrCbStart=a.now()))}),f.on("fn-end",function(t,e){this.xhrCbStart&&f.emit("xhr-cb-time",[a.now()-this.xhrCbStart,this.onload,e],e)})}},{}],16:[function(t,e,n){e.exports=function(t){var e=document.createElement("a"),n=window.location,r={};e.href=t,r.port=e.port;var o=e.href.split("://");!r.port&&o[1]&&(r.port=o[1].split("/")[0].split("#").pop().split(":")[1]),r.port&&"0"!==r.port||(r.port="https"===o[0]?"443":"80"),r.hostname=e.hostname||n.hostname,r.pathname=e.pathname,r.protocol=o[0],"/"!==r.pathname.charAt(0)&&(r.pathname="/"+r.pathname);var i=!e.protocol||":"===e.protocol||e.protocol===n.protocol,a=e.hostname===document.domain&&e.port===n.port;return r.sameOrigin=i&&(!e.hostname||a),r}},{}],17:[function(t,e,n){function r(){}function o(t,e,n){return function(){return i(t,[f.now()].concat(s(arguments)),e?null:this,n),e?void 0:this}}var i=t("handle"),a=t(21),s=t(22),c=t("ee").get("tracer"),f=t("loader"),u=NREUM;"undefined"==typeof window.newrelic&&(newrelic=u);var d=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease"],p="api-",l=p+"ixn-";a(d,function(t,e){u[e]=o(p+e,!0,"api")}),u.addPageAction=o(p+"addPageAction",!0),u.setCurrentRouteName=o(p+"routeName",!0),e.exports=newrelic,u.interaction=function(){return(new r).get()};var h=r.prototype={createTracer:function(t,e){var n={},r=this,o="function"==typeof e;return i(l+"tracer",[f.now(),t,n],r),function(){if(c.emit((o?"":"no-")+"fn-start",[f.now(),r,o],n),o)try{return e.apply(this,arguments)}catch(t){throw c.emit("fn-err",[arguments,this,t],n),t}finally{c.emit("fn-end",[f.now()],n)}}}};a("actionText,setName,setAttribute,save,ignore,onEnd,getContext,end,get".split(","),function(t,e){h[e]=o(l+e)}),newrelic.noticeError=function(t){"string"==typeof t&&(t=new Error(t)),i("err",[t,f.now()])}},{}],18:[function(t,e,n){e.exports=function(t){if("string"==typeof t&&t.length)return 
t.length;if("object"==typeof t){if("undefined"!=typeof ArrayBuffer&&t instanceof ArrayBuffer&&t.byteLength)return t.byteLength;if("undefined"!=typeof Blob&&t instanceof Blob&&t.size)return t.size;if(!("undefined"!=typeof FormData&&t instanceof FormData))try{return JSON.stringify(t).length}catch(e){return}}}},{}],19:[function(t,e,n){var r=0,o=navigator.userAgent.match(/Firefox[\/\s](\d+\.\d+)/);o&&(r=+o[1]),e.exports=r},{}],20:[function(t,e,n){function r(t,e){if(!o)return!1;if(t!==o)return!1;if(!e)return!0;if(!i)return!1;for(var n=i.split("."),r=e.split("."),a=0;a<r.length;a++)if(r[a]!==n[a])return!1;return!0}var o=null,i=null,a=/Version\/(\S+)\s+Safari/;if(navigator.userAgent){var s=navigator.userAgent,c=s.match(a);c&&s.indexOf("Chrome")===-1&&s.indexOf("Chromium")===-1&&(o="Safari",i=c[1])}e.exports={agent:o,version:i,match:r}},{}],21:[function(t,e,n){function r(t,e){var n=[],r="",i=0;for(r in t)o.call(t,r)&&(n[i]=e(r,t[r]),i+=1);return n}var o=Object.prototype.hasOwnProperty;e.exports=r},{}],22:[function(t,e,n){function r(t,e,n){e||(e=0),"undefined"==typeof n&&(n=t?t.length:0);for(var r=-1,o=n-e||0,i=Array(o<0?0:o);++r<o;)i[r]=t[e+r];return i}e.exports=r},{}],23:[function(t,e,n){e.exports={exists:"undefined"!=typeof window.performance&&window.performance.timing&&"undefined"!=typeof window.performance.timing.navigationStart}},{}],24:[function(t,e,n){function r(t){return!(t&&t instanceof Function&&t.apply&&!t[a])}var o=t("ee"),i=t(22),a="nr#original",s=Object.prototype.hasOwnProperty,c=!1;e.exports=function(t,e){function n(t,e,n,o){function nrWrapper(){var r,a,s,c;try{a=this,r=i(arguments),s="function"==typeof n?n(r,a):n||{}}catch(f){p([f,"",[r,a,o],s])}u(e+"start",[r,a,o],s);try{return c=t.apply(a,r)}catch(d){throw u(e+"err",[r,a,d],s),d}finally{u(e+"end",[r,a,c],s)}}return r(t)?t:(e||(e=""),nrWrapper[a]=t,d(t,nrWrapper),nrWrapper)}function f(t,e,o,i){o||(o="");var 
a,s,c,f="-"===o.charAt(0);for(c=0;c<e.length;c++)s=e[c],a=t[s],r(a)||(t[s]=n(a,f?s+o:o,i,s))}function u(n,r,o){if(!c||e){var i=c;c=!0;try{t.emit(n,r,o,e)}catch(a){p([a,n,r,o])}c=i}}function d(t,e){if(Object.defineProperty&&Object.keys)try{var n=Object.keys(t);return n.forEach(function(n){Object.defineProperty(e,n,{get:function(){return t[n]},set:function(e){return t[n]=e,e}})}),e}catch(r){p([r])}for(var o in t)s.call(t,o)&&(e[o]=t[o]);return e}function p(e){try{t.emit("internal-error",e)}catch(n){}}return t||(t=o),n.inPlace=f,n.flag=a,n}},{}],ee:[function(t,e,n){function r(){}function o(t){function e(t){return t&&t instanceof r?t:t?c(t,s,i):i()}function n(n,r,o,i){if(!p.aborted||i){t&&t(n,r,o);for(var a=e(o),s=m(n),c=s.length,f=0;f<c;f++)s[f].apply(a,r);var d=u[g[n]];return d&&d.push([b,n,r,a]),a}}function l(t,e){y[t]=m(t).concat(e)}function h(t,e){var n=y[t];if(n)for(var r=0;r<n.length;r++)n[r]===e&&n.splice(r,1)}function m(t){return y[t]||[]}function v(t){return d[t]=d[t]||o(n)}function w(t,e){f(t,function(t,n){e=e||"feature",g[n]=e,e in u||(u[e]=[])})}var y={},g={},b={on:l,addEventListener:l,removeEventListener:h,emit:n,get:v,listeners:m,context:e,buffer:w,abort:a,aborted:!1};return b}function i(){return new r}function a(){(u.api||u.feature)&&(p.aborted=!0,u=p.backlog={})}var s="nr#context",c=t("gos"),f=t(21),u={},d={},p=e.exports=o();p.backlog=u},{}],gos:[function(t,e,n){function r(t,e,n){if(o.call(t,e))return t[e];var r=n();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(t,e,{value:r,writable:!0,enumerable:!1}),r}catch(i){}return t[e]=r,r}var o=Object.prototype.hasOwnProperty;e.exports=r},{}],handle:[function(t,e,n){function r(t,e,n,r){o.buffer([t],r),o.emit(t,e,n)}var o=t("ee").get("handle");e.exports=r,r.ee=o},{}],id:[function(t,e,n){function r(t){var e=typeof t;return!t||"object"!==e&&"function"!==e?-1:t===window?0:a(t,i,function(){return o++})}var o=1,i="nr#id",a=t("gos");e.exports=r},{}],loader:[function(t,e,n){function 
r(){if(!E++){var t=x.info=NREUM.info,e=l.getElementsByTagName("script")[0];if(setTimeout(u.abort,3e4),!(t&&t.licenseKey&&t.applicationID&&e))return u.abort();f(g,function(e,n){t[e]||(t[e]=n)}),c("mark",["onload",a()+x.offset],null,"api");var n=l.createElement("script");n.src="https://"+t.agent,e.parentNode.insertBefore(n,e)}}function o(){"complete"===l.readyState&&i()}function i(){c("mark",["domContent",a()+x.offset],null,"api")}function a(){return O.exists&&performance.now?Math.round(performance.now()):(s=Math.max((new Date).getTime(),s))-x.offset}var s=(new Date).getTime(),c=t("handle"),f=t(21),u=t("ee"),d=t(20),p=window,l=p.document,h="addEventListener",m="attachEvent",v=p.XMLHttpRequest,w=v&&v.prototype;NREUM.o={ST:setTimeout,SI:p.setImmediate,CT:clearTimeout,XHR:v,REQ:p.Request,EV:p.Event,PR:p.Promise,MO:p.MutationObserver};var y=""+location,g={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",agent:"js-agent.newrelic.com/nr-spa-1099.min.js"},b=v&&w&&w[h]&&!/CriOS/.test(navigator.userAgent),x=e.exports={offset:s,now:a,origin:y,features:{},xhrWrappable:b,userAgent:d};t(17),l[h]?(l[h]("DOMContentLoaded",i,!1),p[h]("load",r,!1)):(l[m]("onreadystatechange",o),p[m]("onload",r)),c("mark",["firstbyte",s],null,"api");var E=0,O=t(23)},{}]},{},["loader",2,15,5,3,4]);</script><TITLE>Sito Webex Enterprise</TITLE>
<meta http-equiv="Pragma" content="no-cache">
<script language="JavaScript">
function setCookie(name,value)
{
var Days = 30;
var exp = new Date();
exp.setTime(exp.getTime() + Days*24*60*60*1000);
document.cookie = name + "="+ escape (value) + ";expires=" + exp.toGMTString()+";path=/";
}
function getCookie(Name)
{
var search = Name + "=";
if (document.cookie.length > 0)
{ // if there are any cookies
offset = document.cookie.indexOf(search);
if (offset != -1)
{ // if cookie exists
offset += search.length; // set index of beginning of value
end = document.cookie.indexOf(";", offset); // set index of end of cookie value
if (end == -1)
end = document.cookie.length;
return unescape(document.cookie.substring(offset, end));
}
}
}
//default page should never load inside of another frame
if (top.location != self.location) {
top.location = self.location;
}
var oneDay= 1*24*60*60*1000;
var expDate = new Date();
expDate.setTime (expDate.getTime() + oneDay);
var cookieExpires = expDate.toGMTString();
document.cookie="verifyCookie=test; expires="+cookieExpires
if (document.cookie.length<=0 || getCookie("verifyCookie") == null){
window.open('https://campus.webex.com/mw3300/mywebex/jsp/common/warningnote.jsp?siteurl=campus', 'Warning', 'toolbar=no,menubar=no,status=no,scrollbars=auto,resizable=yes,width=300,height=220');
}
document.cookie="verifyCookie=CLEAR; expires=Sun, 09-Nov-97 01:00:00 GMT";
try{
if('/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("meetinginfo")!=-1||( '/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("e.do")!=-1&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("siteurl")!=-1)&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("landingpage.do")==-1&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("mainframe.do")==-1&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("mywebex")==-1&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("frame.do")==-1){
setCookie("jmtlogloginclicktime",new Date().getTime());
}
}catch(ex){
}
var dom = document.getElementById ? 1 : 0;
var ns4 = (document.layers && !dom ) ? 1 : 0;
// do for ns4 resize problem
function mm_reloadPage(init) { //reloads the window if Nav4 resized
if (init==true) {
with (navigator) {
if ((appName=="Netscape") && (parseInt(appVersion)==4)) {
document.mm_pgW=innerWidth;
document.mm_pgH=innerHeight;
onresize=mm_reloadPage;
}
}
} else if (innerWidth!=document.mm_pgW || innerHeight!=document.mm_pgH) {
location.reload();
}
}
if(ns4)
mm_reloadPage(true);
document.cookie = "screenWidth=" + screen.width + "; path=/; secure";
function closeWindow() {
window.close(opener=0);
}
function submitChildFrame(){
window.frames["mainFrame"].postChildForm("\x2fmw3300\x2fmywebex\x2floginframe.do\x3fsiteurl\x3dcampus\x26rnd\x3d0.8746987491314661");
}
function submitChildFrame4Header(){
window.frames["header"].postChildForm4Logout();
}
</script>
<base href="https://campus.webex.com/mw3300/mywebex/jsp/frame/mywebex.jsp">
</HEAD>
<!-- CDN Host: akamaicdn.webex.com Status: OK -->
<FRAMESET id="topframeset" BORDER=0 FRAMEBORDER=0 FRAMESPACING=0 ROWS="131,*,0">
<FRAME SCROLLING="auto" NORESIZE NAME="header" SRC="/mw3300/mywebex/header.do?service=10&siteurl=campus&rnd=0.513406995989277" title="The header frame of Cisco WebEx Meetings">
<FRAME SCROLLING="auto" NORESIZE NAME="mainFrame" SRC="/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661" target="_top" title="The content frame of Cisco WebEx Meetings">
<FRAME SCROLLING="no" NORESIZE NAME="rotation" SRC="/mw3300/mywebex/frame/clientpath.do?siteurl=campus" title="The clientPath frame of Cisco WebEx Meetings">
</FRAMESET>
<noframes>
<h2>Spiacenti.</h2>
<p><b>Webex richiede l'uso di Netscape Navigator 4.0, Internet Explorer 4.0 o versioni successive.</b></p>
</noframes>
</html>
You could try using the By module:
from selenium import webdriver
from selenium.webdriver.common.by import By

# Start Firefox and load the target page.
browser = webdriver.Firefox()
browser.get("https://www.theurl.com")

# Locate the field by its name attribute, then type into it.
user_field = browser.find_element(By.NAME, "userName")
user_field.send_keys("Test")
Thank you guys. Lukas's solution works fine and I can log in. But I don't understand why it works only for the link you used. In fact, if I use this other link, "https://campus.webex.com/mw3300/mywebex/default.do?siteurl=campus&service=7", which I need because it offers more functions, it doesn't work! The same code doesn't work! Why?
I switched to Python 2.7, but there are some ASCII problems... Maybe this is the reason?
This is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Firefox()
delay = 100  # seconds
browser.get("https://campus.webex.com/mw3300/mywebex/default.do?siteurl=campus&service=7")
# until() hands back the username element once it is present in the DOM.
myelem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, 'mwx-ipt-username')))
# Keep real element references: send_keys() returns None, so chaining it onto
# the lookup would leave `utente` and `psw` bound to None.
utente = myelem
utente.send_keys('User')
psw = browser.find_element_by_id("mwx-ipt-password")
psw.send_keys('Psw')
# Submit the form that contains the username field.
myelem.submit()
but this code doesn't find the elements... why??
bye

Extract URL from webpage and save to disk

I am trying to write a script to automatically query sci-hub.io with an article's title and save a PDF copy of the article's full text to my computer with a specific file name.
To do this I have written the following code:
# Base address of the service; each article page is requested at <base><DOI>.
url = "http://sci-hub.io/"
data = read_csv("C:\\Users\\Sangeeta's\\Downloads\\distillersr_export (1).csv")

# For every row of the CSV, request the page for its DOI and dump the response.
for index, row in data.iterrows():
    try:
        # Build the address once so the printed and the requested URL agree.
        article_url = url + str(row['DOI'])
        print(article_url)
        res = requests.get(article_url)
        print(res.content)
    except (KeyError, requests.RequestException):
        # Missing DOI column value or a failed HTTP request; anything else
        # (including Ctrl-C) is allowed to propagate.
        print('NO DOI: ' + str(row['ref']))
This opens a CSV file with a list of DOIs and the names of the files to be saved. For each DOI, it then queries sci-hub.io for the full text. The returned page embeds the PDF in an iframe; however, I am now unsure how to extract the URL for the PDF and save it to disk.
An example of the page can be seen in the image below:
In this image, the desired URL is http://dacemirror.sci-hub.io/journal-article/3a257a9ec768d1c3d80c066186aba421/pajno2010.pdf.
How can I automatically extract this URL and then save the PDF file to disk?
When I print res.content, I get this:
b'<!DOCTYPE html>\n<html>\n <head>\n <title></title>\n <meta charset="UTF-8">\n <meta name="viewport" content="width=device-width">\n </head>\n <body>\n <style type = "text/css">\n body {background-color:#F0F0F0}\n div {overflow: hidden; position: absolute;}\n #top {top:0;left:0;width:100%;height:50px;font-size:14px} /* 40px */\n #content {top:50px;left:0;bottom:0;width:100%}\n p {margin:0;padding:10px}\n a {font-size:12px;font-family:sans-serif}\n a.target {font-weight:normal;color:green;margin-left:10px}\n a.reopen {font-weight:normal;color:blue;text-decoration:none;margin-left:10px}\n iframe {width:100%;height:100%}\n \n p.agitation {padding-top:5px;font-size:20px;text-align:center}\n p.agitation a {font-size:20px;text-decoration:none;color:green}\n\n .banner {position:absolute;z-index:9999;top:400px;left:0px;width:300px;height:225px;\n border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px}\n .banner img {border:0}\n \n p.donate {padding:0;margin:0;padding-top:5px;text-align:center;background:green;height:40px}\n p.donate a {color:white;font-weight:bold;text-decoration:none;font-size:20px}\n\n #save {position:absolute;z-index:9999;top:180px;left:8px;width:210px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:#F0F0F0;color:#333}\n\n #save a {text-decoration:none;color:white;font-size:inherit;color:#666}\n\n #save p { margin: 0; padding: 0; margin-top: 8px}\n\n #reload {position:absolute;z-index:9999;top:240px;left:8px;width:210px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:#F0F0F0;color:#333}\n\n #reload a {text-decoration:none;color:white;font-size:inherit;color:#666}\n\n #reload p { margin: 0; padding: 0; margin-top: 8px}\n\n\n #saveastro {position:absolute;z-index:9999;top:360px;left:8px;width:230px;height:70px;\n border-radius: 4px; border: solid 1px #ccc; background: white; text-align:center}\n 
#saveastro p { margin: 0; padding: 0; margin-top: 16px}\n \n \n #donate {position:absolute;z-index:9999;top:170px;right:16px;width:220px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:white;color:#333}\n \n #donate a {text-decoration:none;color:green;font-size:inherit}\n\n #donatein {position:absolute;z-index:9999;top:220px;right:16px;width:220px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:green;color:#333}\n\n #donatein a {text-decoration:none;color:white;font-size:inherit}\n \n #banner {position:absolute;z-index:9999;top:50%;left:45px;width:250px;height:250px; padding: 0; border: solid 1px white; border-radius: 4px}\n \n </style>\n \n \n \n <script type = "text/javascript">\n window.onload = function() {\n var url = document.getElementById(\'url\');\n if (url.innerHTML.length > 77)\n url.innerHTML = url.innerHTML.substring(0,77) + \'...\';\n };\n </script>\n <div id = "top">\n \n <p class="agitation" style = "padding-top:12px">\n \xd0\xa1\xd1\x82\xd1\x80\xd0\xb0\xd0\xbd\xd0\xb8\xd1\x87\xd0\xba\xd0\xb0 \xd0\xbf\xd1\x80\xd0\xbe\xd0\xb5\xd0\xba\xd1\x82\xd0\xb0 Sci-Hub \xd0\xb2 \xd1\x81\xd0\xbe\xd1\x86\xd0\xb8\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xbd\xd1\x8b\xd1\x85 \xd1\x81\xd0\xb5\xd1\x82\xd1\x8f\xd1\x85 \xe2\x86\x92 <a target="_blank" href="https://vk.com/sci_hub">vk.com/sci_hub</a>\n </p>\n \n </div>\n \n <div id = "content">\n <iframe src = "http://moscow.sci-hub.io/202d9ebdfbb8c0c56964a31b2fdfe8e9/roerdink2016.pdf" id = "pdf"></iframe>\n </div>\n \n <div id = "donate">\n <p><a target = "_blank" href = "//sci-hub.io/donate">\xd0\xbf\xd0\xbe\xd0\xb4\xd0\xb4\xd0\xb5\xd1\x80\xd0\xb6\xd0\xb0\xd1\x82\xd1\x8c \xd0\xbf\xd1\x80\xd0\xbe\xd0\xb5\xd0\xba\xd1\x82 →</a></p>\n </div>\n <div id = "donatein">\n <p><a target = "_blank" href = "//sci-hub.io/donate">support the project →</a></p>\n </div>\n <div id = "save">\n <p>\xe2\x87\xa3 
\xd1\x81\xd0\xbe\xd1\x85\xd1\x80\xd0\xb0\xd0\xbd\xd0\xb8\xd1\x82\xd1\x8c \xd1\x81\xd1\x82\xd0\xb0\xd1\x82\xd1\x8c\xd1\x8e</p>\n </div>\n <div id = "reload">\n <p>↻ \xd1\x81\xd0\xba\xd0\xb0\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c \xd0\xb7\xd0\xb0\xd0\xbd\xd0\xbe\xd0\xb2\xd0\xbe</p>\n </div>\n \n \n<!-- Yandex.Metrika counter --> <script type="text/javascript"> (function (d, w, c) { (w[c] = w[c] || []).push(function() { try { w.yaCounter10183018 = new Ya.Metrika({ id:10183018, clickmap:true, trackLinks:true, accurateTrackBounce:true, ut:"noindex" }); } catch(e) { } }); var n = d.getElementsByTagName("script")[0], s = d.createElement("script"), f = function () { n.parentNode.insertBefore(s, n); }; s.type = "text/javascript"; s.async = true; s.src = "https://mc.yandex.ru/metrika/watch.js"; if (w.opera == "[object Opera]") { d.addEventListener("DOMContentLoaded", f, false); } else { f(); } })(document, window, "yandex_metrika_callbacks"); </script> <noscript><div><img src="https://mc.yandex.ru/watch/10183018?ut=noindex" style="position:absolute; left:-9999px;" alt="" /></div></noscript> <!-- /Yandex.Metrika counter -->\n </body>\n</html>\n'
Which does include the URL, however I am unsure how to extract it.
Update:
I am now able to extract the URL but when I try to access the page with the PDF (through urllib.request) I get a 403 response even though the URL is valid. Any ideas on why and how to fix? (I am able to access through my browser so not IP blocked)
You can use urllib library to access the html of the page and even download files, and regex to find the url of the file you want to download.
import re
import urllib.request

# Download the page and decode it to text so the str regex below can search it.
with urllib.request.urlopen(".../index.html") as site:
    data = site.read().decode("utf-8", errors="replace")  # turns the contents of the site into a string

# Only non-capturing groups are used, so findall() returns each whole URL as a
# single string instead of a tuple of captured pieces. The dot before "pdf" is
# escaped so that only a literal ".pdf" suffix matches.
files = re.findall(r'https?://[\w_-]+(?:\.[\w_-]+)+(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?\.pdf', data)  # finds the url
for file in files:
    # "filepath" is where you want to save it; use a different path per URL,
    # otherwise every download overwrites the previous one.
    urllib.request.urlretrieve(file, filepath)
Here is the Solution:-
# Pull the PDF address out of the page's <iframe src = "..."> tag.
# res.text is the decoded body, so the str pattern works on Python 2 and 3.
url = re.search(r'<iframe src = "\s*([^"]+)"', res.text)
if url is None:
    # Not every page embeds a PDF; fail with a clear message instead of an
    # AttributeError on None.
    raise ValueError('no <iframe src = "..."> found in the response')
urllib.urlretrieve(url.group(1), 'C:/.../Docs/test.pdf')
I ran it and it is working :)
For Python 3:
Change urllib.urlretrieve to urllib.request.urlretrieve (and import urllib.request).
You can do it with some clunky code that requires selenium, requests and scrapy.
Use selenium to request either an article title or DOI.
>>> from selenium import webdriver
>>> driver = webdriver.Chrome()
>>> driver.get("http://sci-hub.io/")
>>> input_box = driver.find_element_by_name('request')
>>> input_box.send_keys('amazing scientific results\n')
An article by the title 'amazing scientific results' doesn't seem to exist. As a result, the site returns a diagnostic page in the browser window which we can ignore. It also puts 'http://sci-hub.io/' in webdriver's current_url property. This is helpful because it's an indication that the requested result isn't available.
>>> driver.current_url
'http://sci-hub.io/'
Let's try again, looking for the item that you know exists.
>>> driver.get("http://sci-hub.io/")
>>> input_box = driver.find_element_by_name('request')
>>> input_box.send_keys('DOI: 10.1016/j.anai.2016.01.022\n')
>>> driver.current_url
'http://sci-hub.io/10.1016/j.anai.2016.01.022'
This time the site returns a distinctive url. Unfortunately, if we load this using selenium we will get the pdf and, unless you're more able than I am, you will find it difficult to download this to a file on your machine.
Instead, I download it using the requests library. Loaded in this form you will find that the url of the pdf becomes visible in the HTML.
>>> import requests
>>> r = requests.get(driver.current_url)
To ferret out the url I use scrapy.
>>> from scrapy.selector import Selector
>>> selector = Selector(text=r.text)
>>> pdf_url = selector.xpath('.//iframe/@src')[0].extract()
Finally I use requests again to download the pdf so that I can save it to a conveniently named file on local storage.
>>> r = requests.get(pdf_url).content
>>> open('article_name', 'wb').write(r)
211853
I solved this using a combination of the answers above - namely SBO7 & Roxerg.
I use the following to extract the URL from the page and then download the PDF:
# Fetch the Sci-Hub landing page for this row's DOI; the PDF is embedded in an
# <iframe> on that page.
res = requests.get('http://sci-hub.io/' + str(row['DOI']))
useful = BeautifulSoup(res.content, "html5lib").find_all("iframe")
if not useful:
    # Some DOIs resolve to an ordinary web page with no embedded PDF.
    raise ValueError('no <iframe> found for DOI ' + str(row['DOI']))

# Extract the absolute http(s) URL from the iframe markup.
urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(useful[0]))
if not urls:
    raise ValueError('no http(s) URL found in the iframe for DOI ' + str(row['DOI']))

response = requests.get(urls[0])
# Fail loudly on 403/404 instead of saving an error page as a .pdf file.
response.raise_for_status()
with open("C:\\Users\\Sangeeta's\\Downloads\\ref\\" + str(row['ref']) + '.pdf', 'wb') as fw:
    fw.write(response.content)
Note: This will not work for all articles - some link to webpages (example) and this doesn't correctly work for those.

Categories

Resources