Scrape "Button" tag with Selenium - python
import requests
from selenium import webdriver
import bs4
# Open the Lyft price-estimate page in Chrome and print the HTML Selenium sees.

# Raw string: the Windows path contains backslashes that must not be parsed as
# escape sequences (`\P` and `\c` are invalid escapes in a normal literal).
PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)

# Origin and destination coordinates sent in the query string.
oLat = 33.8026087
oLong = -84.3369491999999
dLat = 33.79149
dLong = -84.32312

# "%2C" is the URL-encoded comma between latitude and longitude.
url = (
    "https://ride.lyft.com/ridetype"
    f"?origin={oLat}%2C{oLong}"
    f"&destination={dLat}%2C{dLong}"
    "&ride_type=&offerProductId=standard"
)

try:
    driver.get(url)
    # page_source is read immediately after get() returns; nothing here waits
    # for elements that the page adds later.
    content = driver.page_source
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(content, "html.parser")
    print(soup)
    print(url)
finally:
    # Always close the browser, even if navigation or parsing fails.
    driver.quit()
Here is my current code. I am trying to scrape the Lyft price estimate.
The data is in the "button" tag, but it does not show up in the HTML printed by the code above. How can I get this data to show up?
import requests
from selenium import webdriver
import bs4
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Open the Lyft price-estimate page and wait for a specific span to be visible.

# Raw string: the Windows path contains backslashes that must not be parsed as
# escape sequences (`\P` and `\c` are invalid escapes in a normal literal).
PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)

# Origin and destination coordinates sent in the query string.
oLat = 33.7885662
oLong = -84.326684
dLat = 33.4486296
dLong = -84.4550443

# "%2C" is the URL-encoded comma between latitude and longitude.
url = (
    "https://ride.lyft.com/ridetype"
    f"?origin={oLat}%2C{oLong}"
    f"&destination={dLat}%2C{dLong}"
    "&ride_type=&offerProductId=standard"
)

try:
    driver.get(url)
    # NOTE(review): in a CSS selector a space is the descendant combinator, so
    # this asks for an <lctkqn> element nested inside span.sc-7e9e68d9-0.
    # If both tokens are classes on the same span, the selector would be
    # "span.sc-7e9e68d9-0.lctkqn" -- confirm against the page markup.
    spanThing = WebDriverWait(driver, 20).until(
        EC.visibility_of_element_located(
            (By.CSS_SELECTOR, "span.sc-7e9e68d9-0 lctkqn")
        )
    )
    print(spanThing)
finally:
    # Close the browser even when the wait above raises (e.g. on timeout);
    # previously quit() was skipped in that case.
    driver.quit()
I tried this additional code, but for some reason it doesn't find the span with that class. I'm not sure why.
To extract the page source, you need to induce WebDriverWait for the visibility_of_element_located() of a static element. You can use the following locator strategy:
# Load the Lyft price-estimate page, wait for a known static element, then
# print the page source.
# Relies on `driver`, `WebDriverWait`, `By` and `EC` being defined/imported
# elsewhere in the script.

# Origin and destination coordinates sent in the query string.
oLat = 33.8026087
oLong = -84.3369491999999
dLat = 33.79149
dLong = -84.32312

# "%2C" is the URL-encoded comma between latitude and longitude.
url = (
    "https://ride.lyft.com/ridetype"
    f"?origin={oLat}%2C{oLong}"
    f"&destination={dLat}%2C{dLong}"
    "&ride_type=&offerProductId=standard"
)

try:
    driver.get(url)
    # Wait up to 20 seconds for the span containing this text to be visible
    # before reading page_source.
    WebDriverWait(driver, 20).until(
        EC.visibility_of_element_located(
            (By.XPATH, "//span[contains(., 'Sign up / Log in to request ride')]")
        )
    )
    print(driver.page_source)
finally:
    # Close the browser even if the wait times out; previously quit() was
    # only reached on success.
    driver.quit()
Note : You have to add the following imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Console Output:
<html lang="en-US" class="js-focus-visible" data-js-focus-visible=""><head><meta name="viewport" content="width=device-width"><script type="module">
if (window.performance) {
const toSnake = (str) => str.replace(/([A-Z])/g, function($1) {return '_' + $1.toLowerCase();});
const measure = () => {
const { timing } = window.performance;
if (!timing.navigationStart) return;
const al = [
'event_name','sending_service','connection_end','connection_start','dom_complete',
'dom_content_loaded_event_end','dom_content_loaded_event_start','dom_interactive',
'dom_loading','domain_lookup_end','domain_lookup_start','fetch_start','load_event_end',
'load_event_start','navigation_start','redirect_end','redirect_start','request_start',
'response_end','response_start','secure_connection_start','unload_event_end',
'unload_event_start','connect_start','connect_end','ms_first_paint','source','uri_path',
'request_end','code','track_id','uri_href'
];
const { href = '', pathname = '' } = window.location;
const sE = { event_name: 'navigation_timing_absolute', uri_href: href, uri_path: pathname, sending_service: 'riderweb', source: 'riderweb' };
for (let eN in timing) {
const sEN = toSnake(eN);
if (al.includes(sEN)) { sE[sEN] = timing[eN]; }
}
// iOS 11 supports ES modules, but sendBeacon not available until 11.3.
if (navigator.sendBeacon) {
navigator.sendBeacon('https://www.lyft.com/api/track', JSON.stringify(sE));
}
};
try {
if (document.readyState === 'complete') {
measure();
} else {
window.addEventListener('load', measure);
}
} catch(e) {}
}
</script><script>
var _i18n_extends = Object.assign || function (target) { for (var i = 1; i < arguments.length; i++) { var source = arguments[i]; for (var key in source) { if (Object.prototype.hasOwnProperty.call(source, key)) { target[key] = source[key]; } } } return target; };
;if(!window.__TRANSLATIONS__) window.__TRANSLATIONS__ = {};
window.__TRANSLATIONS__.locale = "en-US";
window.__TRANSLATIONS__.bundleName = "common";
if (!window.__TRANSLATIONS__.data) window.__TRANSLATIONS__.data = {};
_i18n_extends(window.__TRANSLATIONS__.data, {"%;":{"s":"OK"},"#":{"s":"Sorry, we can't find that page"},"$":{"s":"Sorry, there was an error"},"%":{"s":"Back"},"A":{"s":"No tip"},"T":{"s":"Lyft: Request a ride on the web"},"p":{"s":"Current location"},"q":{"s":"You set your pickup as \"Your Location\"{originatingAppMsg}"},"r":{"s":" in Google Maps"},"s":{"s":"To use the same pickup location, Lyft needs access to your current location."},"t":{"s":"Share your location"},"u":{"s":"Location sharing is denied"},"w":{"s":"Submit"},"x":{"s":"Save"},"y":{"s":"Confirm"},"z":{"s":"Unknown error"},"{":{"s":"Close"},"|":{"s":"Cancel"},"}":{"s":"Edit"},"~":{"s":"Delete"},"! ":{"s":"Done"},"!!":{"s":"Log out"},"!#":{"s":"Are you sure you want to log out?"},"!%":{"s":"Payment defaults"},"!&":{"s":"Add a payment method to get started."},"!(":{"s":"Add new card"},"!)":{"s":"Could not update payment method"},"!*":{"s":"Payment"},"!+":{"s":"manage your payment methods"},"!,":{"s":"Payment method"},"!-":{"s":"Card failed!"},"!.":{"s":"Payment method not supported on ride.lyft.com."},"!\u002F":{"s":"Payment method updated across Lyft apps."},"!0":{"s":"You cannot delete your only valid payment method."},"!1":{"s":"Gift cards"},"!2":{"s":"redeem gift cards"},"!3":{"s":"This field is required"},"!4":{"s":"Something went wrong. Please try again."},"!5":{"s":"Click to log out or switch accounts"},"!6":{"s":"Go back"},"!Z":{"s":"Schedule"},"!k":{"s":"schedule a ride"},"(6":{"s":"Ride"},"(7":{"s":"Rent"},"(8":{"s":"Rent a car through Lyft or our partner Sixt"},"(9":{"s":"Help"},"(:":{"s":"Business"},"(;":{"s":"Upcoming rides"},"(\u003C":{"s":"Install on Phone"},"(=":{"s":"Sign up \u002F Log in"},"(\u003E":{"s":"Log in"},"(l":{"s":"Install app"},"(m":{"s":"Free"},")z":{"s":"Not now"},"){":{"s":"Get the Lyft app"},")|":{"s":"More travel options from the palm of your hand"},")}":{"s":"From bikes to rentals and everything in between. 
If it gets you there, it's in the app."},"*\u003E":{"s":"Install on Desktop"},"*?":{"s":"Install on Desktop. It's free and takes up no space on your device"},"*C":{"s":"Text me a link"},"*D":{"s":"We'll send you a text with a link to download the app."},"*E":{"s":"Enter mobile phone number"},"*F":{"s":"Phone invalid"},"*G":{"s":"Refresh"},"*H":{"s":"An update is available"},",+":{"s":"View profile"},",,":{"s":"Get a ride"},",-":{"s":"Rides"},",.":{"s":"Gift cards"},",\u002F":{"s":"Promos"},",0":{"s":"Donate"},",1":{"s":"Invite friends"},",2":{"s":"Help"},",3":{"s":"Settings"},",4":{"s":"Safety Tools"},",5":{"s":"Lyft Rentals"},")d":{"s":"Log in \u002F Sign up"},")e":{"s":"You will need to log in to {action}!"},")f":{"s":"Log in"},")g":{"s":"Cancel"},"a":{"s":"Lyft and OpenStreetMap watermark"},"#L":{"s":"add promotions"},"&^":{"s":"Just now"},"&`.zero":{"s":"{minutes} minutes ago"},"&_.one":{"s":"{minutes} minute ago"},"&`.two":{"s":"{minutes} minutes ago"},"&`.few":{"s":"{minutes} minutes ago"},"&`.many":{"s":"{minutes} minutes ago"},"&`.other":{"s":"{minutes} minutes ago"},"&b.zero":{"s":"{hours} hours ago"},"&a.one":{"s":"{hours} hour ago"},"&b.two":{"s":"{hours} hours ago"},"&b.few":{"s":"{hours} hours ago"},"&b.many":{"s":"{hours} hours ago"},"&b.other":{"s":"{hours} hours ago"},"&d.zero":{"s":"{days} days ago"},"&c.one":{"s":"{days} day ago"},"&d.two":{"s":"{days} days ago"},"&d.few":{"s":"{days} days ago"},"&d.many":{"s":"{days} days ago"},"&d.other":{"s":"{days} days ago"},"&e":{"s":"Less than a minute"},"&g.zero":{"s":"{minutes} Total minutes"},"&f.one":{"s":"{minutes} Total minute"},"&g.two":{"s":"{minutes} Total minutes"},"&g.few":{"s":"{minutes} Total minutes"},"&g.many":{"s":"{minutes} Total minutes"},"&g.other":{"s":"{minutes} Total minutes"},"&i.zero":{"s":"{hours} Total hours"},"&h.one":{"s":"{hours} Total hour"},"&i.two":{"s":"{hours} Total hours"},"&i.few":{"s":"{hours} Total hours"},"&i.many":{"s":"{hours} Total hours"},"&i.other":{"s":"{hours} 
Total hours"},"&k.zero":{"s":"{days} Total days"},"&j.one":{"s":"{days} Total day"},"&k.two":{"s":"{days} Total days"},"&k.few":{"s":"{days} Total days"},"&k.many":{"s":"{days} Total days"},"&k.other":{"s":"{days} Total days"},"(a":{"s":"Any fare exceeding your Lyft Cash balance will be charged to your default payment method."},"(|":{"s":"Total"},"(}":{"s":"You'll pay this price unless you add a stop, change your destination, or if credit expires."},"(~":{"s":"This is an estimated range for your trip."},") ":{"s":"\u003CLink\u003ELog in\u003C\u002FLink\u003E or sign up to lock in your price and request a ride."},")?":{"s":"Driver Name:"},")#":{"s":"Driver's car image"},")A":{"s":"License Plate Number:"},")B":{"s":"Pick up"},")C":{"s":"Picked up"},")D":{"s":"Drop-off"},")E":{"s":"Dropped off"},")F":{"s":"Current location"},")c":{"s":"Close banner"},")y":{"s":"Riders"},"*I":{"s":"Add card"},"*J":{"s":"Edit {cardLabel}"},"+2":{"s":"$10"},"+3":{"s":"$8"},"+4":{"s":"$10"},"+5":{"s":"Unlimited 180-min classic rides for 24 hours"},"+6":{"s":"$15"},"+7":{"s":"Unlimited 30-min classic rides for 24 hours"},"+8":{"s":"Your payment info will be stored securely."},",#":{"s":"Please follow \u003CSupportLink\u003Ethese instructions\u003C\u002FSupportLink\u003E to allow this site to show notifications."},",$":{"s":"Notifications are blocked"},",\u003C":{"s":"Session expired"},",=":{"s":"You have been logged out. Please log back in to continue."},"!u":{"s":"Click to edit your pickup location"},"%\u002F":{"s":"You must \u003CLink\u003Elog in\u003C\u002FLink\u003E to {action}."},"&J":{"s":"Something went wrong. Unable to load your referral history. 
Please try again."},"7f523512b795a02fd9b9b05a1e22ff9b":{"s":"Card number"},"3effb3a930ea2ce61705bffc624e19b6":{"s":"Expiration"},"755c8f863223ae3f7ac0ac1cfe8b3072":{"s":"Name on card"},"22b715147b81b76566fa183406659069":{"s":"Country"},"4b3d5e03b24b6bbc630d15ad2251755f":{"s":"Billing address"},"e0a8872668d31bb76156a8d80a5d7a6c":{"s":"City"},"f420cf2cf310bbff1ead064745e66ec1":{"s":"State"},"8e9d206ff46216065a42a3953a63bd9f":{"s":"Province \u002F Territory"},"9dca7ddd59d7aca64aae58c7a99e16ce":{"s":"State \u002F Province"},"50be4be10369e747d757e7b2db2c9ed3":{"s":"Zip code"},"11ceb56a912fd18cc9ea1054c5405c13":{"s":"Postal code"},"5a0a89ab4fd1ceebfd9f68b88d27e685":{"s":"Save"},"45c9b92858c6ce6b50c1967661063ae8":{"s":"Cancel"},"29fc403cabcebe790ddd09c592f7e7cd":{"s":"There was a problem reading your card details. Please try again."},"1ae24aeff3771f629b2f865074b68050":{"s":"You must be logged in to add a payment method."},"275c89584bcddfbf0019d8d5a2ce6128":{"s":"You must be logged in to edit a payment method."},"2a420e791e0ec6d47cb64d5fab8376a9":{"s":"Please fill out all required fields"},"a966a08942254351695c6993e781301e":{"s":"Something went wrong. Please check your information and try again"}});
</script><meta charset="utf-8"><meta content="IE=Edge" http-equiv="X-UA-Compatible"><meta name="google" content="notranslate"><meta http-equiv="Accept-CH" content="DPR, Viewport-Width, Width, Downlink, Save-Data, Content-DPR"><link rel="home" href="https://ride.lyft.com"><link rel="canonical" href="https://ride.lyft.com"><link rel="icon" href="https://cdn.lyft.com/static/www-meta-assets/favicon.ico"><link rel="shortcut icon" sizes="192x192" href="https://cdn.lyft.com/static/riderweb/images/icons/icon-192x192.png"><link rel="apple-touch-startup-image" href="https://cdn.lyft.com/static/riderweb/images/icons/icon-192x192.png"><link rel="apple-touch-icon" href="https://cdn.lyft.com/static/riderweb/images/icons/icon-192x192.png"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="black-translucent"><meta property="og:title" content="Lyft: Request a ride on the web"><meta property="og:url" content="https://ride.lyft.com"><meta name="twitter:card" content="summary_large_image"><meta name="twitter:site" content="#lyft"><meta name="msapplication-starturl" content="https://ride.lyft.com"><link rel="stylesheet" href="https://cdn.lyft.com/coreui/base.4.6.5.css"><meta name="google-site-verification" content="V9fk-oLTj9Ewu7Kc6Vetf94qp8HZ3gfjxFMkn8LmZ3Y"><link rel="manifest" href="/manifest.json" crossorigin="use-credentials"><meta name="theme-color" content="#FFFFFF"><meta name="description" content="Request a Lyft ride in a web browser on your phone, tablet, or laptop – no app download required. Get a ride from a friendly driver in minutes."><meta property="og:description" content="Request a Lyft ride in a web browser on your phone, tablet, or laptop – no app download required. Get a ride from a friendly driver in minutes."><meta property="og:image" content="/images/share.png">
.
,
<next-route-announcer><p aria-live="assertive" id="__next-route-announcer__" role="alert" style="border: 0px; clip: rect(0px, 0px, 0px, 0px); height: 1px; margin: -1px; overflow: hidden; padding: 0px; position: absolute; width: 1px; white-space: nowrap; overflow-wrap: normal;"></p></next-route-announcer><iframe name="__privateStripeMetricsController9540" frameborder="0" allowtransparency="true" scrolling="no" role="presentation" allow="payment *" src="https://js.stripe.com/v3/m-outer-93afeeb17bc37e711759584dbfc50d47.html#url=https%3A%2F%2Fride.lyft.com%2Fridetype%3Forigin%3D33.8026087%252C-84.3369491999999%26destination%3D33.79149%252C-84.32312%26ride_type%3D%26offerProductId%3Dstandard&title=Lyft%3A%20Price%20estimate&referrer=&muid=NA&sid=NA&version=6&preview=false" aria-hidden="true" tabindex="-1" style="border: none !important; margin: 0px !important; padding: 0px !important; width: 1px !important; min-width: 100% !important; overflow: hidden !important; display: block !important; visibility: hidden !important; position: fixed !important; height: 1px !important; pointer-events: none !important; user-select: none !important;"></iframe></body></html>
Related
Handling Date picker using Selenium
<input placeholder="MM/DD/YYYY" autocomplete="on" type="text" class="form-control" value="01/01/2020" style="height: 40px; color: (25, 25, 25); font-weight: bold; font-size: 14px; background: > error: Message: element not interactable
# Can you check this?
# You can select the datepicker based on the XPath index [1][2].
# Relies on `driver` (an open WebDriver) and `Keys` being defined/imported
# elsewhere in the script.

# Double-quoted literal so the single quotes inside the XPath do not end the
# string; `@type` is the XPath attribute test.
date_input = driver.find_element_by_xpath("((//input[@type='text']))")
date_input.click()
# Select the existing value and delete it before typing the new date.
date_input.send_keys(Keys.CONTROL, "a")
date_input.send_keys(Keys.BACKSPACE)
date_input.send_keys("02/14/2020", Keys.RETURN)
# This is my Python script for a date picker. Hope this can be useful in some ways.
from selenium import webdriver

# set chromedriver.exe path
driver = webdriver.Chrome(executable_path="C:\\chromedriver.exe")
driver.implicitly_wait(0.5)

# launch URL
driver.get("https://jqueryui.com/datepicker/")

# switch to frame (`@class` is the XPath attribute test)
l = driver.find_element_by_xpath("//iframe[@class='demo-frame']")
driver.switch_to.frame(l)

# identify element inside frame
d = driver.find_element_by_id("datepicker")
d.click()

# identify list of all dates
m = driver.find_elements_by_xpath("//table/tbody/tr/td")

# iterate over list
for i in m:
    # verify required date then click
    if i.text == '3':
        i.click()
        break

# get selected date
s = d.get_attribute('value')
print("Date entered is: ")
print(s)

# browser quit
driver.quit()
Print out selenium text variable
I have a function which extracts data from a Twitter page however when the script completes I receive no outputs. The function is meant to output various information from a tweet. Im just trying to print out the second tweet on the page. card definition Function def get_tweet_data(card): username - card.find_element_by_xpath(".//span").text handle = card.find_element_by_xpath('.//span[contains(text(), "#" )]').text # try: postdate = card.find_element_by_xpath('.//time').get_attribute('datetime') except NoSuchElementException: return comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text responding = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text text = comment + responding # add the both text fields together reply_cnt = card.find_element_by_xpath('.//div[#data-testid="reply"]').text retweet_cnt = card.find_element_by_xpath('.//div[#data-testid="retweet"]').text like_cnt = card.find_element_by_xpath('.//div[#data-testid="like"]').text tweet = (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt) return tweet Command line arguments python twitter.py get_tweet_data(1)
So, this one took a while; but, I was able to get the information for you. When I went through Twitter's HTML, 6 different xpath calls were needed # Count of number of Tweets (//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]//article[#role='article']//div[#data-testId='tweet']) # First Card (//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]//article[#role='article']//div[#data-testId='tweet'])[1] # Twiter Card Likes, Retweets, Replies (//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]//article[#role='article']//div[#data-testId='tweet'])[1]//div[contains(#aria-label, 'likes')] # Twitter's Text Content (//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]//article[#role='article']//div[#data-testId='tweet'])[1]//div[#lang] # Twitter's DateTime (//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]//article[#role='article']//div[#data-testId='tweet'])[1]//time[#datetime] # Twitter href is the Twitter Account Poster ((//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]//article[#role='article']//div[#data-testId='tweet'])[1]//a[#role='link'])[1] Once I determined the proper 
xpath calls, I, then, created a class to store my data class Twitter_Info: """This class contains the information regarding to the Twitter Card""" CardNumber : int Likes : int Retweets : int Replies : int ContentInfo : str PostDate : str PosterAccount : str def print_info(self): print(f'Card Number: {self.CardNumber}') print(f'Poster Account: {self.PosterAccount}') print(f'Tweet Date: {self.PostDate}') print(f'Likes: {self.Likes}') print(f'Replies: {self.Replies}') print(f'Retweets: {self.Retweets}') print(f'Tweet Content: {self.ContentInfo}') Once this was accomplished, I added different methods to help accomplish the task at hand wait_for_tweets_to_load number_of_tweets_displayed scroll_to_card get_card_likes_retweets_replies get_card_text_content get_card_datetime get_card_poster_info Once these were determined, I was able to scroll to each card and scrape the data MAIN PROGRAM - For Reference from selenium import webdriver from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait as DriverWait from selenium.webdriver.support import expected_conditions as DriverConditions from selenium.common.exceptions import WebDriverException import time class Twitter_Info: """This class contains the information regarding to the Twitter Card""" CardNumber : int Likes : int Retweets : int Replies : int ContentInfo : str PostDate : str PosterAccount : str def print_info(self): print(f'Card Number: {self.CardNumber}') print(f'Poster Account: {self.PosterAccount}') print(f'Tweet Date: {self.PostDate}') print(f'Likes: {self.Likes}') print(f'Replies: {self.Replies}') print(f'Retweets: {self.Retweets}') print(f'Tweet Content: {self.ContentInfo}') def get_chrome_driver(): """This sets up our Chrome Driver and returns it as an object""" path_to_chrome = "F:\Selenium_Drivers\Windows_Chrome85_Driver\chromedriver.exe" chrome_options = webdriver.ChromeOptions() # Browser is 
displayed in a custom window size chrome_options.add_argument("window-size=1500,1000") return webdriver.Chrome(executable_path = path_to_chrome, options = chrome_options) def wait_displayed(driver : ChromeDriver, xpath: str, int = 5): try: DriverWait(driver, int).until( DriverConditions.presence_of_element_located(locator = (By.XPATH, xpath)) ) except: raise WebDriverException(f'Timeout: Failed to find {xpath}') def is_displayed(driver : ChromeDriver, xpath: str, int = 5): try: webElement = DriverWait(driver, int).until( DriverConditions.presence_of_element_located(locator = (By.XPATH, xpath)) ) return True if webElement != None else False except: return False def scroll_to_element(driver : ChromeDriver, xpath: str, int = 5): try: webElement = DriverWait(driver, int).until( DriverConditions.presence_of_element_located(locator = (By.XPATH, xpath)) ) driver.execute_script("arguments[0].scrollIntoView();", webElement) except: raise WebDriverException(f'Timeout: Failed to find {xpath}\nResult: Failed to Scroll') def wait_for_tweets_to_load(driver : ChromeDriver): if is_displayed(driver, "//main[#role='main']//div[#data-testid='primaryColumn']//div[contains(#aria-label, 'Loading Tweets')]"): for counter in range(10): if is_displayed(driver, "//main[#role='main']//div[#data-testid='primaryColumn']//div[contains(#aria-label, 'Loading Tweets')]") and counter == 9: raise Exception("Page Failed To Load Tweets") elif is_displayed(driver, "//main[#role='main']//div[#data-testid='primaryColumn']//div[contains(#aria-label, 'Loading Tweets')]") == False: break else: time.sleep(3) def number_of_tweets_displayed(driver : ChromeDriver): """Note: This number will change dynamically when we scroll down on the page ( new Tweets will start loading )""" xpath = "{0}{1}{2}".format("(//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']", "//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]", 
"//article[#role='article']//div[#data-testId='tweet'])") return driver.find_elements(By.XPATH, xpath).__len__() def scroll_to_card(driver : ChromeDriver, card_number : int): xpath = "{0}{1}{2}".format("(//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']", "//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]", "//article[#role='article']//div[#data-testId='tweet'])") scroll_to_element(driver, xpath = f'{xpath}[{card_number}]') def get_card_likes_retweets_replies(driver : ChromeDriver, card_number : int): xpath = "{0}{1}{2}".format("(//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']", "//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]", "//article[#role='article']//div[#data-testId='tweet'])") xpath = f'{xpath}[{card_number}]//div[contains(#aria-label, "likes")]' return driver.find_element(By.XPATH, xpath).get_attribute('aria-label').split(',') def get_card_text_content(driver : ChromeDriver, card_number : int): xpath = "{0}{1}{2}".format("(//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']", "//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]", "//article[#role='article']//div[#data-testId='tweet'])") xpath = f'{xpath}[{card_number}]//div[#lang]' return driver.find_element(By.XPATH, xpath).text def get_card_datetime(driver : ChromeDriver, card_number : int): xpath = "{0}{1}{2}".format("(//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']", "//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]", "//article[#role='article']//div[#data-testId='tweet'])") xpath = f'{xpath}[{card_number}]//time[#datetime]' return driver.find_element(By.XPATH, 
xpath).get_attribute('datetime') def get_card_poster_info(driver : ChromeDriver, card_number : int): xpath = "{0}{1}{2}".format("((//main[#role='main']//div[#data-testid='primaryColumn']//section[#aria-labelledby='accessible-list-0']", "//div[contains(#aria-label, 'Timeline:')]//div[contains(#style, 'position: absolute; width: 100%;')]//article[#role='article']", "//div[#data-testId='tweet'])") xpath = f'{xpath}[{card_number}]//a[#role="link"])[1]' return driver.find_element(By.XPATH, xpath).get_attribute('href') # Gets our chrome driver and opens our site chrome_driver = get_chrome_driver() chrome_driver.get("https://twitter.com/bbc") wait_displayed(chrome_driver, "//div[#data-testid='placementTracking']//div[#role='button']//span[text()='Follow']") wait_displayed(chrome_driver, "//section[#aria-label='Sign up']") wait_displayed(chrome_driver, "//aside[#aria-label='Who to follow']") wait_for_tweets_to_load(chrome_driver) # Get number of Tweets that are displayed numberOfTweetsDisplayed = number_of_tweets_displayed(chrome_driver) twitter_cards = [] # Scrape Card Information for cards in range(numberOfTweetsDisplayed): scroll_to_card(chrome_driver, (cards + 1)) twitter_card = Twitter_Info() twitter_card.CardNumber = cards + 1 # Get the Like | Retweet | Replies Info raw_info = get_card_likes_retweets_replies(chrome_driver, (cards + 1)) twitter_card.Replies = raw_info[0].strip().split(' ')[0] twitter_card.Retweets = raw_info[1].strip().split(' ')[0] twitter_card.Likes = raw_info[2].strip().split(' ')[0] # Get rest of our data twitter_card.ContentInfo = get_card_text_content(chrome_driver, (cards + 1)) twitter_card.PostDate = get_card_datetime(chrome_driver, (cards + 1)) twitter_card.PosterAccount = get_card_poster_info(chrome_driver, (cards + 1)) # Display our information and add it to our list twitter_card.print_info() twitter_cards.append(twitter_card) print(f'Added Card Number {(cards + 1)} successfully') 
print('========================================================\n') # Print how many twitter cards were scraped print(f'Twitter Cards Added: {twitter_cards.__len__()}') chrome_driver.quit() chrome_driver.service.stop() SAMPLE OUTPUT Card Number: 1 Poster Account: https://twitter.com/BBC Tweet Date: 2020-06-22T11:22:53.000Z Likes: 1106 Replies: 2827 Retweets: 841 Tweet Content: We’ve always been here to celebrate diversity. But we need to do more, and we will. This is our commitment to long-term change. #RightTheScript Read more about our £100m commitment here: https://bbc.in/37OPMLv Added Card Number 1 successfully ======================================================== Card Number: 2 Poster Account: https://twitter.com/BBC Tweet Date: 2020-11-16T17:01:00.000Z Likes: 100 Replies: 10 Retweets: 36 Tweet Content: More than 100 intact sarcophagi, dating back 2,500 years, have been unearthed near Cairo. Added Card Number 2 successfully ======================================================== Card Number: 3 Poster Account: https://twitter.com/BBC Tweet Date: 2020-11-15T16:01:00.000Z Likes: 68 Replies: 5 Retweets: 16 Tweet Content: With Cornish wildlife facing so many threats from humans, these residents do whatever they can to help #Cornwall with #simon_reeve | 8:10pm | #bbctwo & #bbciplayer . Added Card Number 3 successfully ======================================================== Card Number: 4 Poster Account: https://twitter.com/bbcasiannetwork Tweet Date: 2020-11-14T09:44:41.000Z Likes: 133 Replies: 7 Retweets: 33 Tweet Content: Happy Diwali and Bandi Chhor Divas! Added Card Number 4 successfully ======================================================== Card Number: 5 Poster Account: https://twitter.com/BBC Tweet Date: 2020-11-13T22:18:26.000Z Likes: 443 Replies: 13 Retweets: 86 Tweet Content: It's the clash of the tennis titans #Andy_Murray and... er, #petercrouch ? 
#ChildrenInNeed Added Card Number 5 successfully ======================================================== Card Number: 6 Poster Account: https://twitter.com/BBC Tweet Date: 2020-11-13T20:57:23.000Z Likes: 426 Replies: 25 Retweets: 109 Tweet Content: The official video for this year's star-studded #bbccin single, 'Stop Crying Your Heart Out' is here! Watch now and don't forget to download the song to support #ChildrenInNeed https://bbc.in/32I60EZ Added Card Number 6 successfully ======================================================== Card Number: 7 Poster Account: https://twitter.com/BBC Tweet Date: 2020-11-13T15:37:06.000Z Likes: 18 Replies: 7 Retweets: 7 Tweet Content: It's time for #ChildrenInNeed 2020! Starting RIGHT NOW on #BBCOne & #BBCiPlayer http://bbc.in/3kuv1cG Added Card Number 7 successfully ======================================================== Twitter Cards Added: 7
Python sorting html table
I am looping through a list of servers and connecting with OpenSSL, to retrieve the SSL cert, and grabbing the server name, the date the cert expires, and calculating the number of days until cert expires. I am then building an html table with the data. The columns are Host, Hostname, Expiration Date, and Remaining Days. What is the best way to sort the table by the "Remaining Days" column? # Update the hosts entry ssl_results[str(ip)][0] = host ssl_results[str(ip)][1] = server_name ssl_results[str(ip)][2] = exp_date ssl_results[str(ip)][3] = days_to_expire # Loop through the ssl_results entries and generate a email + results file try: # variable to hold html for email SSLCertificates = """<html> <head> <style> table{width: 1024px;} table, th, td { border: 1px solid black; border-collapse: collapse; } th, td { padding: 5px; text-align: left; } ul:before{ content:attr(data-header); font-size:120%; font-weight:bold; margin-left:-15px; } </style> </head> <body> <p><h2>Blah, </h2> <h3>SSL Expiration Summary:</h3> <span style="color:red;"><b>Blah Blah Blah.<b></span><br><br> <table id=\"exp_ssls\"><tr><th>Host</th><th>Hostname</th><th>Expiration Date</th><th>Remaining Days</th></tr> """ for entries in ssl_results: SSLCertificates += "<tr><td>" + str(entries) + "</td><td>" + str(ssl_results[entries][1]) + "</td><td>" + str( ssl_results[entries][2]) + "</td><td>" + str(ssl_results[entries][3]) + "</td></tr>" SSLCertificates += """</body> </html>""" f = open('SSLCertificates.html', 'w') f.write(SSLCertificates) f.close() filename = 'SSLCertificates.html' attachment = open(filename, 'rb')
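Sort the dict before you build the HTML, then iterate through the sorted result and emit the rows. Use sorted() to sort your dict before you iterate through it. import operator x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0} sorted_x = sorted(x.items(), key=operator.itemgetter(1)) sorted_x will be a list of tuples sorted by the second element in each tuple. dict(sorted_x) == x. In your case each value is a list with the remaining days at index 3, so sort on that element instead (assuming days_to_expire is stored as a number), e.g. sorted(ssl_results.items(), key=lambda item: item[1][3]).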
Selenium doesn't find elements
I'm coding in Python with Selenium WebDriver to automate some stuff, but Selenium's find_element_* methods don't work. These are the webpages I am trying: https://campus.webex.com/mw3300/mywebex/default.do?siteurl=campus&service=10 and https://webmail.aruba.it/cgi-bin/ajaxmail and http://campus.istitutovolta.eu/index.php I tried all strategies: by_class, by_link, by_name, by_id, etc. Nothing works! I use geckodriver and Firefox. Here is the code: from selenium import webdriver browser = webdriver.Firefox() try: browser.get("https://campus.webex.com/mw3100/mywebex/default.do?siteurl=campus&service=7") except: print("pagina non trovata") try: utente=browser.find_element_by_name("userName") except: print("elemento non trovato") else: print(utente) utente.send_keys('user@gestione.eu') try: psw=browser.find_element_by_name("PASSWD") except: print("elemento non trovato") else: print(psw) psw.send_keys('123456')
UPDATE 2 + 3: I got it to work. Check it out: Variant 1: from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC browser = webdriver.Firefox() delay = 100 # seconds # browser.get("https://campus.webex.com/mw3100/mywebex/default.do?siteurl=campus") browser.get("https://campus.webex.com/mw3300/mywebex/login/login.do?siteurl=campus&login_return_url=%2Ftc3300%2Ftrainingcenter%2Fsite%2FinstantSession.do%3Fsiteurl%3Dcampus") myelem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, 'mwx-ipt-username'))) utente = browser.find_element_by_id("mwx-ipt-username").send_keys('user@gestione.eu') Variant 2: from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC browser = webdriver.Firefox() delay = 20 # seconds browser.get("https://campus.webex.com/mw3100/mywebex/default.do?siteurl=campus") browser.switch_to.frame("mainFrame") browser.switch_to.frame("main") myelem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, 'mwx-ipt-username'))) utente = browser.find_element_by_id("mwx-ipt-username").send_keys('user@gestione.eu') The problem was the frameset in the site. In the HTML that Selenium parsed, the content of the frames was missing, so the login form was not found. This is remedied by a) variant 1: you directly open the URL of the appropriate frame; b) variant 2: you switch from the original page into the frame. See: "How to navigate a subframe inside a frameset using Selenium WebDriver with Python?"; "function for switching frames in python, selenium"; "How to identify and switch to the frame in selenium webdriver when frame does not have id"; "How to navigate a subframe inside a frameset using Selenium WebDriver?" 
UPDATE: I will try to find a solution trough the buttons the website provides. Give me some time. I clicked on your link and checked to "userName" and there was no "userName". It's the same, if you open a new tab in your normal firefox, and type "view-source:https://campus.webex.com/mw3300/mywebex/default.do?siteurl=campus" in it. I tried this: from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC browser = webdriver.Firefox() delay = 30 # seconds try: browser.get("https://campus.webex.com/mw3100/mywebex/default.do?siteurl=campus&service=7") except: print("pagina non trovata") myelem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, 'mwx-ipt-username'))) try: utente = browser.find_element_by_id("mwx-ipt-username") except: print("elemento non trovato") It doesn't work. The HTML i always get: <!DOCTYPE html> <html lang="it-IT"> <HEAD> <meta http-equiv="X-UA-Compatible" content="IE=Edge"> <meta http-equiv="content-type" content="text/html; charset=UTF-8"> <meta name='format-detection' content='telephone=no'> <meta name='slack-app-id' content='A5P5FDK33'> <meta name="description" content="5"><link rel="shortcut icon" href="/favicont29.ico" type="image/x-icon"> <script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={xpid:"XQUDUldSGwUCXFdWAAgF"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var o=e[n]={exports:{}};t[n][0].call(o.exports,function(e){var o=t[n][1][e];return r(o||e)},o,o.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<n.length;o++)r(n[o]);return r}({1:[function(t,e,n){function r(t){try{s.console&&console.log(t)}catch(e){}}var o,i=t("ee"),a=t(21),s={};try{o=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof 
console.log&&(s.console=!0,o.indexOf("dev")!==-1&&(s.dev=!0),o.indexOf("nr_dev")!==-1&&(s.nrDev=!0))}catch(c){}s.nrDev&&i.on("internal-error",function(t){r(t.stack)}),s.dev&&i.on("fn-err",function(t,e,n){r(n.stack)}),s.dev&&(r("NR AGENT IN DEVELOPMENT MODE"),r("flags: "+a(s,function(t,e){return t}).join(", ")))},{}],2:[function(t,e,n){function r(t,e,n,r,s){try{l?l-=1:o(s||new UncaughtException(t,e,n),!0)}catch(f){try{i("ierr",[f,c.now(),!0])}catch(d){}}return"function"==typeof u&&u.apply(this,a(arguments))}function UncaughtException(t,e,n){this.message=t||"Uncaught error with no additional information",this.sourceURL=e,this.line=n}function o(t,e){var n=e?null:c.now();i("err",[t,n])}var i=t("handle"),a=t(22),s=t("ee"),c=t("loader"),f=t("gos"),u=window.onerror,d=!1,p="nr#seenError",l=0;c.features.err=!0,t(1),window.onerror=r;try{throw new Error}catch(h){"stack"in h&&(t(13),t(12),"addEventListener"in window&&t(6),c.xhrWrappable&&t(14),d=!0)}s.on("fn-start",function(t,e,n){d&&(l+=1)}),s.on("fn-err",function(t,e,n){d&&!n[p]&&(f(n,p,function(){return!0}),this.thrown=!0,o(n))}),s.on("fn-end",function(){d&&!this.thrown&&l>0&&(l-=1)}),s.on("internal-error",function(t){i("ierr",[t,c.now(),!0])})},{}],3:[function(t,e,n){t("loader").features.ins=!0},{}],4:[function(t,e,n){function r(){M++,N=y.hash,this[u]=g.now()}function o(){M--,y.hash!==N&&i(0,!0);var t=g.now();this[h]=~~this[h]+t-this[u],this[d]=t}function i(t,e){E.emit("newURL",[""+y,e])}function a(t,e){t.on(e,function(){this[e]=g.now()})}var s="-start",c="-end",f="-body",u="fn"+s,d="fn"+c,p="cb"+s,l="cb"+c,h="jsTime",m="fetch",v="addEventListener",w=window,y=w.location,g=t("loader");if(w[v]&&g.xhrWrappable){var b=t(10),x=t(11),E=t(8),O=t(6),P=t(13),R=t(7),T=t(14),L=t(9),j=t("ee"),S=j.get("tracer");t(15),g.features.spa=!0;var 
N,M=0;j.on(u,r),j.on(p,r),j.on(d,o),j.on(l,o),j.buffer([u,d,"xhr-done","xhr-resolved"]),O.buffer([u]),P.buffer(["setTimeout"+c,"clearTimeout"+s,u]),T.buffer([u,"new-xhr","send-xhr"+s]),R.buffer([m+s,m+"-done",m+f+s,m+f+c]),E.buffer(["newURL"]),b.buffer([u]),x.buffer(["propagate",p,l,"executor-err","resolve"+s]),S.buffer([u,"no-"+u]),L.buffer(["new-jsonp","cb-start","jsonp-error","jsonp-end"]),a(T,"send-xhr"+s),a(j,"xhr-resolved"),a(j,"xhr-done"),a(R,m+s),a(R,m+"-done"),a(L,"new-jsonp"),a(L,"jsonp-end"),a(L,"cb-start"),E.on("pushState-end",i),E.on("replaceState-end",i),w[v]("hashchange",i,!0),w[v]("load",i,!0),w[v]("popstate",function(){i(0,M>1)},!0)}},{}],5:[function(t,e,n){function r(t){}if(window.performance&&window.performance.timing&&window.performance.getEntriesByType){var o=t("ee"),i=t("handle"),a=t(13),s=t(12),c="learResourceTimings",f="addEventListener",u="resourcetimingbufferfull",d="bstResource",p="resource",l="-start",h="-end",m="fn"+l,v="fn"+h,w="bstTimer",y="pushState",g=t("loader");g.features.stn=!0,t(8);var b=NREUM.o.EV;o.on(m,function(t,e){var n=t[0];n instanceof b&&(this.bstStart=g.now())}),o.on(v,function(t,e){var n=t[0];n instanceof b&&i("bst",[n,e,this.bstStart,g.now()])}),a.on(m,function(t,e,n){this.bstStart=g.now(),this.bstType=n}),a.on(v,function(t,e){i(w,[e,this.bstStart,g.now(),this.bstType])}),s.on(m,function(){this.bstStart=g.now()}),s.on(v,function(t,e){i(w,[e,this.bstStart,g.now(),"requestAnimationFrame"])}),o.on(y+l,function(t){this.time=g.now(),this.startPath=location.pathname+location.hash}),o.on(y+h,function(t){i("bstHist",[location.pathname+location.hash,this.startPath,this.time])}),f in 
window.performance&&(window.performance["c"+c]?window.performance[f](u,function(t){i(d,[window.performance.getEntriesByType(p)]),window.performance["c"+c]()},!1):window.performance[f]("webkit"+u,function(t){i(d,[window.performance.getEntriesByType(p)]),window.performance["webkitC"+c]()},!1)),document[f]("scroll",r,{passive:!0}),document[f]("keypress",r,!1),document[f]("click",r,!1)}},{}],6:[function(t,e,n){function r(t){for(var e=t;e&&!e.hasOwnProperty(u);)e=Object.getPrototypeOf(e);e&&o(e)}function o(t){s.inPlace(t,[u,d],"-",i)}function i(t,e){return t[1]}var a=t("ee").get("events"),s=t(24)(a,!0),c=t("gos"),f=XMLHttpRequest,u="addEventListener",d="removeEventListener";e.exports=a,"getPrototypeOf"in Object?(r(document),r(window),r(f.prototype)):f.prototype.hasOwnProperty(u)&&(o(window),o(f.prototype)),a.on(u+"-start",function(t,e){var n=t[1],r=c(n,"nr#wrapped",function(){function t(){if("function"==typeof n.handleEvent)return n.handleEvent.apply(n,arguments)}var e={object:t,"function":n}[typeof n];return e?s(e,"fn-",null,e.name||"anonymous"):n});this.wrapped=t[1]=r}),a.on(d+"-start",function(t){t[1]=this.wrapped||t[1]})},{}],7:[function(t,e,n){function r(t,e,n){var r=t[e];"function"==typeof r&&(t[e]=function(){var t=r.apply(this,arguments);return o.emit(n+"start",arguments,t),t.then(function(e){return o.emit(n+"end",[null,e],t),e},function(e){throw o.emit(n+"end",[e],t),e})})}var o=t("ee").get("fetch"),i=t(21);e.exports=o;var a=window,s="fetch-",c=s+"body-",f=["arrayBuffer","blob","json","text","formData"],u=a.Request,d=a.Response,p=a.fetch,l="prototype";u&&d&&p&&(i(f,function(t,e){r(u[l],e,c),r(d[l],e,c)}),r(a,"fetch",s),o.on(s+"end",function(t,e){var n=this;if(e){var r=e.headers.get("content-length");null!==r&&(n.rxSize=r),o.emit(s+"done",[null,e],n)}else o.emit(s+"done",[t],n)}))},{}],8:[function(t,e,n){var r=t("ee").get("history"),o=t(24)(r);e.exports=r,o.inPlace(window.history,["pushState","replaceState"],"-")},{}],9:[function(t,e,n){function r(t){function 
e(){c.emit("jsonp-end",[],p),t.removeEventListener("load",e,!1),t.removeEventListener("error",n,!1)}function n(){c.emit("jsonp-error",[],p),c.emit("jsonp-end",[],p),t.removeEventListener("load",e,!1),t.removeEventListener("error",n,!1)}var r=t&&"string"==typeof t.nodeName&&"script"===t.nodeName.toLowerCase();if(r){var o="function"==typeof t.addEventListener;if(o){var a=i(t.src);if(a){var u=s(a),d="function"==typeof u.parent[u.key];if(d){var p={};f.inPlace(u.parent,[u.key],"cb-",p),t.addEventListener("load",e,!1),t.addEventListener("error",n,!1),c.emit("new-jsonp",[t.src],p)}}}}}function o(){return"addEventListener"in window}function i(t){var e=t.match(u);return e?e[1]:null}function a(t,e){var n=t.match(p),r=n[1],o=n[3];return o?a(o,e[r]):e[r]}function s(t){var e=t.match(d);return e&&e.length>=3?{key:e[2],parent:a(e[1],window)}:{key:t,parent:window}}var c=t("ee").get("jsonp"),f=t(24)(c);if(e.exports=c,o()){var u=/[?&](?:callback|cb)=([^&#]+)/,d=/(.*)\.([^.]+)/,p=/^(\w+)(\.|$)(.*)$/,l=["appendChild","insertBefore","replaceChild"];f.inPlace(HTMLElement.prototype,l,"dom-"),f.inPlace(HTMLHeadElement.prototype,l,"dom-"),f.inPlace(HTMLBodyElement.prototype,l,"dom-"),c.on("dom-start",function(t){r(t[0])})}},{}],10:[function(t,e,n){var r=t("ee").get("mutation"),o=t(24)(r),i=NREUM.o.MO;e.exports=r,i&&(window.MutationObserver=function(t){return this instanceof i?new i(o(t,"fn-")):i.apply(this,arguments)},MutationObserver.prototype=i.prototype)},{}],11:[function(t,e,n){function r(t){var e=a.context(),n=s(t,"executor-",e),r=new f(n);return a.context(r).getCtx=function(){return e},a.emit("new-promise",[r,e],e),r}function o(t,e){return e}var i=t(24),a=t("ee").get("promise"),s=i(a),c=t(21),f=NREUM.o.PR;e.exports=a,f&&(window.Promise=r,["all","race"].forEach(function(t){var e=f[t];f[t]=function(n){function r(t){return function(){a.emit("propagate",[null,!o],i),o=o||!t}}var o=!1;c(n,function(e,n){Promise.resolve(n).then(r("all"===t),r(!1))});var 
i=e.apply(f,arguments),s=f.resolve(i);return s}}),["resolve","reject"].forEach(function(t){var e=f[t];f[t]=function(t){var n=e.apply(f,arguments);return t!==n&&a.emit("propagate",[t,!0],n),n}}),f.prototype["catch"]=function(t){return this.then(null,t)},f.prototype=Object.create(f.prototype,{constructor:{value:r}}),c(Object.getOwnPropertyNames(f),function(t,e){try{r[e]=f[e]}catch(n){}}),a.on("executor-start",function(t){t[0]=s(t[0],"resolve-",this),t[1]=s(t[1],"resolve-",this)}),a.on("executor-err",function(t,e,n){t[1](n)}),s.inPlace(f.prototype,["then"],"then-",o),a.on("then-start",function(t,e){this.promise=e,t[0]=s(t[0],"cb-",this),t[1]=s(t[1],"cb-",this)}),a.on("then-end",function(t,e,n){this.nextPromise=n;var r=this.promise;a.emit("propagate",[r,!0],n)}),a.on("cb-end",function(t,e,n){a.emit("propagate",[n,!0],this.nextPromise)}),a.on("propagate",function(t,e,n){this.getCtx&&!e||(this.getCtx=function(){if(t instanceof Promise)var e=a.context(t);return e&&e.getCtx?e.getCtx():this})}),r.toString=function(){return""+f})},{}],12:[function(t,e,n){var r=t("ee").get("raf"),o=t(24)(r),i="equestAnimationFrame";e.exports=r,o.inPlace(window,["r"+i,"mozR"+i,"webkitR"+i,"msR"+i],"raf-"),r.on("raf-start",function(t){t[0]=o(t[0],"fn-")})},{}],13:[function(t,e,n){function r(t,e,n){t[0]=a(t[0],"fn-",null,n)}function o(t,e,n){this.method=n,this.timerDuration=isNaN(t[1])?0:+t[1],t[0]=a(t[0],"fn-",this,n)}var i=t("ee").get("timer"),a=t(24)(i),s="setTimeout",c="setInterval",f="clearTimeout",u="-start",d="-";e.exports=i,a.inPlace(window,[s,"setImmediate"],s+d),a.inPlace(window,[c],c+d),a.inPlace(window,[f,"clearImmediate"],f+d),i.on(c+u,r),i.on(s+u,o)},{}],14:[function(t,e,n){function r(t,e){d.inPlace(e,["onreadystatechange"],"fn-",s)}function o(){var t=this,e=u.context(t);t.readyState>3&&!e.resolved&&(e.resolved=!0,u.emit("xhr-resolved",[],t)),d.inPlace(t,y,"fn-",s)}function i(t){g.push(t),h&&(x?x.then(a):v?v(a):(E=-E,O.data=E))}function a(){for(var 
t=0;t<g.length;t++)r([],g[t]);g.length&&(g=[])}function s(t,e){return e}function c(t,e){for(var n in t)e[n]=t[n];return e}t(6);var f=t("ee"),u=f.get("xhr"),d=t(24)(u),p=NREUM.o,l=p.XHR,h=p.MO,m=p.PR,v=p.SI,w="readystatechange",y=["onload","onerror","onabort","onloadstart","onloadend","onprogress","ontimeout"],g=[];e.exports=u;var b=window.XMLHttpRequest=function(t){var e=new l(t);try{u.emit("new-xhr",[e],e),e.addEventListener(w,o,!1)}catch(n){try{u.emit("internal-error",[n])}catch(r){}}return e};if(c(l,b),b.prototype=l.prototype,d.inPlace(b.prototype,["open","send"],"-xhr-",s),u.on("send-xhr-start",function(t,e){r(t,e),i(e)}),u.on("open-xhr-start",r),h){var x=m&&m.resolve();if(!v&&!m){var E=1,O=document.createTextNode(E);new h(a).observe(O,{characterData:!0})}}else f.on("fn-end",function(t){t[0]&&t[0].type===w||a()})},{}],15:[function(t,e,n){function r(t){var e=this.params,n=this.metrics;if(!this.ended){this.ended=!0;for(var r=0;r<d;r++)t.removeEventListener(u[r],this.listener,!1);if(!e.aborted){if(n.duration=a.now()-this.startTime,4===t.readyState){e.status=t.status;var i=o(t,this.lastSize);if(i&&(n.rxSize=i),this.sameOrigin){var c=t.getResponseHeader("X-NewRelic-App-Data");c&&(e.cat=c.split(", ").pop())}}else e.status=0;n.cbTime=this.cbTime,f.emit("xhr-done",[t],t),s("xhr",[e,n,this.startTime])}}}function o(t,e){var n=t.responseType;if("json"===n&&null!==e)return e;var r="arraybuffer"===n||"blob"===n||"json"===n?t.response:t.responseText;return h(r)}function i(t,e){var n=c(e),r=t.params;r.host=n.hostname+":"+n.port,r.pathname=n.pathname,t.sameOrigin=n.sameOrigin}var a=t("loader");if(a.xhrWrappable){var s=t("handle"),c=t(16),f=t("ee"),u=["load","error","abort","timeout"],d=u.length,p=t("id"),l=t(19),h=t(18),m=window.XMLHttpRequest;a.features.xhr=!0,t(14),f.on("new-xhr",function(t){var 
e=this;e.totalCbs=0,e.called=0,e.cbTime=0,e.end=r,e.ended=!1,e.xhrGuids={},e.lastSize=null,l&&(l>34||l<10)||window.opera||t.addEventListener("progress",function(t){e.lastSize=t.loaded},!1)}),f.on("open-xhr-start",function(t){this.params={method:t[0]},i(this,t[1]),this.metrics={}}),f.on("open-xhr-end",function(t,e){"loader_config"in NREUM&&"xpid"in NREUM.loader_config&&this.sameOrigin&&e.setRequestHeader("X-NewRelic-ID",NREUM.loader_config.xpid)}),f.on("send-xhr-start",function(t,e){var n=this.metrics,r=t[0],o=this;if(n&&r){var i=h(r);i&&(n.txSize=i)}this.startTime=a.now(),this.listener=function(t){try{"abort"===t.type&&(o.params.aborted=!0),("load"!==t.type||o.called===o.totalCbs&&(o.onloadCalled||"function"!=typeof e.onload))&&o.end(e)}catch(n){try{f.emit("internal-error",[n])}catch(r){}}};for(var s=0;s<d;s++)e.addEventListener(u[s],this.listener,!1)}),f.on("xhr-cb-time",function(t,e,n){this.cbTime+=t,e?this.onloadCalled=!0:this.called+=1,this.called!==this.totalCbs||!this.onloadCalled&&"function"==typeof n.onload||this.end(n)}),f.on("xhr-load-added",function(t,e){var n=""+p(t)+!!e;this.xhrGuids&&!this.xhrGuids[n]&&(this.xhrGuids[n]=!0,this.totalCbs+=1)}),f.on("xhr-load-removed",function(t,e){var n=""+p(t)+!!e;this.xhrGuids&&this.xhrGuids[n]&&(delete this.xhrGuids[n],this.totalCbs-=1)}),f.on("addEventListener-end",function(t,e){e instanceof m&&"load"===t[0]&&f.emit("xhr-load-added",[t[1],t[2]],e)}),f.on("removeEventListener-end",function(t,e){e instanceof m&&"load"===t[0]&&f.emit("xhr-load-removed",[t[1],t[2]],e)}),f.on("fn-start",function(t,e,n){e instanceof m&&("onload"===n&&(this.onload=!0),("load"===(t[0]&&t[0].type)||this.onload)&&(this.xhrCbStart=a.now()))}),f.on("fn-end",function(t,e){this.xhrCbStart&&f.emit("xhr-cb-time",[a.now()-this.xhrCbStart,this.onload,e],e)})}},{}],16:[function(t,e,n){e.exports=function(t){var e=document.createElement("a"),n=window.location,r={};e.href=t,r.port=e.port;var 
o=e.href.split("://");!r.port&&o[1]&&(r.port=o[1].split("/")[0].split("#").pop().split(":")[1]),r.port&&"0"!==r.port||(r.port="https"===o[0]?"443":"80"),r.hostname=e.hostname||n.hostname,r.pathname=e.pathname,r.protocol=o[0],"/"!==r.pathname.charAt(0)&&(r.pathname="/"+r.pathname);var i=!e.protocol||":"===e.protocol||e.protocol===n.protocol,a=e.hostname===document.domain&&e.port===n.port;return r.sameOrigin=i&&(!e.hostname||a),r}},{}],17:[function(t,e,n){function r(){}function o(t,e,n){return function(){return i(t,[f.now()].concat(s(arguments)),e?null:this,n),e?void 0:this}}var i=t("handle"),a=t(21),s=t(22),c=t("ee").get("tracer"),f=t("loader"),u=NREUM;"undefined"==typeof window.newrelic&&(newrelic=u);var d=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease"],p="api-",l=p+"ixn-";a(d,function(t,e){u[e]=o(p+e,!0,"api")}),u.addPageAction=o(p+"addPageAction",!0),u.setCurrentRouteName=o(p+"routeName",!0),e.exports=newrelic,u.interaction=function(){return(new r).get()};var h=r.prototype={createTracer:function(t,e){var n={},r=this,o="function"==typeof e;return i(l+"tracer",[f.now(),t,n],r),function(){if(c.emit((o?"":"no-")+"fn-start",[f.now(),r,o],n),o)try{return e.apply(this,arguments)}catch(t){throw c.emit("fn-err",[arguments,this,t],n),t}finally{c.emit("fn-end",[f.now()],n)}}}};a("actionText,setName,setAttribute,save,ignore,onEnd,getContext,end,get".split(","),function(t,e){h[e]=o(l+e)}),newrelic.noticeError=function(t){"string"==typeof t&&(t=new Error(t)),i("err",[t,f.now()])}},{}],18:[function(t,e,n){e.exports=function(t){if("string"==typeof t&&t.length)return t.length;if("object"==typeof t){if("undefined"!=typeof ArrayBuffer&&t instanceof ArrayBuffer&&t.byteLength)return t.byteLength;if("undefined"!=typeof Blob&&t instanceof Blob&&t.size)return t.size;if(!("undefined"!=typeof FormData&&t instanceof FormData))try{return JSON.stringify(t).length}catch(e){return}}}},{}],19:[function(t,e,n){var 
r=0,o=navigator.userAgent.match(/Firefox[\/\s](\d+\.\d+)/);o&&(r=+o[1]),e.exports=r},{}],20:[function(t,e,n){function r(t,e){if(!o)return!1;if(t!==o)return!1;if(!e)return!0;if(!i)return!1;for(var n=i.split("."),r=e.split("."),a=0;a<r.length;a++)if(r[a]!==n[a])return!1;return!0}var o=null,i=null,a=/Version\/(\S+)\s+Safari/;if(navigator.userAgent){var s=navigator.userAgent,c=s.match(a);c&&s.indexOf("Chrome")===-1&&s.indexOf("Chromium")===-1&&(o="Safari",i=c[1])}e.exports={agent:o,version:i,match:r}},{}],21:[function(t,e,n){function r(t,e){var n=[],r="",i=0;for(r in t)o.call(t,r)&&(n[i]=e(r,t[r]),i+=1);return n}var o=Object.prototype.hasOwnProperty;e.exports=r},{}],22:[function(t,e,n){function r(t,e,n){e||(e=0),"undefined"==typeof n&&(n=t?t.length:0);for(var r=-1,o=n-e||0,i=Array(o<0?0:o);++r<o;)i[r]=t[e+r];return i}e.exports=r},{}],23:[function(t,e,n){e.exports={exists:"undefined"!=typeof window.performance&&window.performance.timing&&"undefined"!=typeof window.performance.timing.navigationStart}},{}],24:[function(t,e,n){function r(t){return!(t&&t instanceof Function&&t.apply&&!t[a])}var o=t("ee"),i=t(22),a="nr#original",s=Object.prototype.hasOwnProperty,c=!1;e.exports=function(t,e){function n(t,e,n,o){function nrWrapper(){var r,a,s,c;try{a=this,r=i(arguments),s="function"==typeof n?n(r,a):n||{}}catch(f){p([f,"",[r,a,o],s])}u(e+"start",[r,a,o],s);try{return c=t.apply(a,r)}catch(d){throw u(e+"err",[r,a,d],s),d}finally{u(e+"end",[r,a,c],s)}}return r(t)?t:(e||(e=""),nrWrapper[a]=t,d(t,nrWrapper),nrWrapper)}function f(t,e,o,i){o||(o="");var a,s,c,f="-"===o.charAt(0);for(c=0;c<e.length;c++)s=e[c],a=t[s],r(a)||(t[s]=n(a,f?s+o:o,i,s))}function u(n,r,o){if(!c||e){var i=c;c=!0;try{t.emit(n,r,o,e)}catch(a){p([a,n,r,o])}c=i}}function d(t,e){if(Object.defineProperty&&Object.keys)try{var n=Object.keys(t);return n.forEach(function(n){Object.defineProperty(e,n,{get:function(){return t[n]},set:function(e){return t[n]=e,e}})}),e}catch(r){p([r])}for(var o in 
t)s.call(t,o)&&(e[o]=t[o]);return e}function p(e){try{t.emit("internal-error",e)}catch(n){}}return t||(t=o),n.inPlace=f,n.flag=a,n}},{}],ee:[function(t,e,n){function r(){}function o(t){function e(t){return t&&t instanceof r?t:t?c(t,s,i):i()}function n(n,r,o,i){if(!p.aborted||i){t&&t(n,r,o);for(var a=e(o),s=m(n),c=s.length,f=0;f<c;f++)s[f].apply(a,r);var d=u[g[n]];return d&&d.push([b,n,r,a]),a}}function l(t,e){y[t]=m(t).concat(e)}function h(t,e){var n=y[t];if(n)for(var r=0;r<n.length;r++)n[r]===e&&n.splice(r,1)}function m(t){return y[t]||[]}function v(t){return d[t]=d[t]||o(n)}function w(t,e){f(t,function(t,n){e=e||"feature",g[n]=e,e in u||(u[e]=[])})}var y={},g={},b={on:l,addEventListener:l,removeEventListener:h,emit:n,get:v,listeners:m,context:e,buffer:w,abort:a,aborted:!1};return b}function i(){return new r}function a(){(u.api||u.feature)&&(p.aborted=!0,u=p.backlog={})}var s="nr#context",c=t("gos"),f=t(21),u={},d={},p=e.exports=o();p.backlog=u},{}],gos:[function(t,e,n){function r(t,e,n){if(o.call(t,e))return t[e];var r=n();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(t,e,{value:r,writable:!0,enumerable:!1}),r}catch(i){}return t[e]=r,r}var o=Object.prototype.hasOwnProperty;e.exports=r},{}],handle:[function(t,e,n){function r(t,e,n,r){o.buffer([t],r),o.emit(t,e,n)}var o=t("ee").get("handle");e.exports=r,r.ee=o},{}],id:[function(t,e,n){function r(t){var e=typeof t;return!t||"object"!==e&&"function"!==e?-1:t===window?0:a(t,i,function(){return o++})}var o=1,i="nr#id",a=t("gos");e.exports=r},{}],loader:[function(t,e,n){function r(){if(!E++){var t=x.info=NREUM.info,e=l.getElementsByTagName("script")[0];if(setTimeout(u.abort,3e4),!(t&&t.licenseKey&&t.applicationID&&e))return u.abort();f(g,function(e,n){t[e]||(t[e]=n)}),c("mark",["onload",a()+x.offset],null,"api");var n=l.createElement("script");n.src="https://"+t.agent,e.parentNode.insertBefore(n,e)}}function o(){"complete"===l.readyState&&i()}function 
i(){c("mark",["domContent",a()+x.offset],null,"api")}function a(){return O.exists&&performance.now?Math.round(performance.now()):(s=Math.max((new Date).getTime(),s))-x.offset}var s=(new Date).getTime(),c=t("handle"),f=t(21),u=t("ee"),d=t(20),p=window,l=p.document,h="addEventListener",m="attachEvent",v=p.XMLHttpRequest,w=v&&v.prototype;NREUM.o={ST:setTimeout,SI:p.setImmediate,CT:clearTimeout,XHR:v,REQ:p.Request,EV:p.Event,PR:p.Promise,MO:p.MutationObserver};var y=""+location,g={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",agent:"js-agent.newrelic.com/nr-spa-1099.min.js"},b=v&&w&&w[h]&&!/CriOS/.test(navigator.userAgent),x=e.exports={offset:s,now:a,origin:y,features:{},xhrWrappable:b,userAgent:d};t(17),l[h]?(l[h]("DOMContentLoaded",i,!1),p[h]("load",r,!1)):(l[m]("onreadystatechange",o),p[m]("onload",r)),c("mark",["firstbyte",s],null,"api");var E=0,O=t(23)},{}]},{},["loader",2,15,5,3,4]);</script><TITLE>Sito Webex Enterprise</TITLE> <meta http-equiv="Pragma" content="no-cache"> <script language="JavaScript"> function setCookie(name,value) { var Days = 30; var exp = new Date(); exp.setTime(exp.getTime() + Days*24*60*60*1000); document.cookie = name + "="+ escape (value) + ";expires=" + exp.toGMTString()+";path=/"; } function getCookie(Name) { var search = Name + "="; if (document.cookie.length > 0) { // if there are any cookies offset = document.cookie.indexOf(search); if (offset != -1) { // if cookie exists offset += search.length; // set index of beginning of value end = document.cookie.indexOf(";", offset); // set index of end of cookie value if (end == -1) end = document.cookie.length; return unescape(document.cookie.substring(offset, end)); } } } //default page should never load inside of another frame if (top.location != self.location) { top.location = self.location; } var oneDay= 1*24*60*60*1000; var expDate = new Date(); expDate.setTime (expDate.getTime() + oneDay); var cookieExpires = expDate.toGMTString(); document.cookie="verifyCookie=test; 
expires="+cookieExpires if (document.cookie.length<=0 || getCookie("verifyCookie") == null){ window.open('https://campus.webex.com/mw3300/mywebex/jsp/common/warningnote.jsp?siteurl=campus', 'Warning', 'toolbar=no,menubar=no,status=no,scrollbars=auto,resizable=yes,width=300,height=220'); } document.cookie="verifyCookie=CLEAR; expires=Sun, 09-Nov-97 01:00:00 GMT"; try{ if('/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("meetinginfo")!=-1||( '/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("e.do")!=-1&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("siteurl")!=-1)&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("landingpage.do")==-1&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("mainframe.do")==-1&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("mywebex")==-1&&'/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661'.indexOf("frame.do")==-1){ setCookie("jmtlogloginclicktime",new Date().getTime()); } }catch(ex){ } var dom = document.getElementById ? 1 : 0; var ns4 = (document.layers && !dom ) ? 
1 : 0; // do for ns4 resize problem function mm_reloadPage(init) { //reloads the window if Nav4 resized if (init==true) { with (navigator) { if ((appName=="Netscape") && (parseInt(appVersion)==4)) { document.mm_pgW=innerWidth; document.mm_pgH=innerHeight; onresize=mm_reloadPage; } } } else if (innerWidth!=document.mm_pgW || innerHeight!=document.mm_pgH) { location.reload(); } } if(ns4) mm_reloadPage(true); document.cookie = "screenWidth=" + screen.width + "; path=/; secure"; function closeWindow() { window.close(opener=0); } function submitChildFrame(){ window.frames["mainFrame"].postChildForm("\x2fmw3300\x2fmywebex\x2floginframe.do\x3fsiteurl\x3dcampus\x26rnd\x3d0.8746987491314661"); } function submitChildFrame4Header(){ window.frames["header"].postChildForm4Logout(); } </script> <base href="https://campus.webex.com/mw3300/mywebex/jsp/frame/mywebex.jsp"> </HEAD> <!-- CDN Host: akamaicdn.webex.com Status: OK --> <FRAMESET id="topframeset" BORDER=0 FRAMEBORDER=0 FRAMESPACING=0 ROWS="131,*,0"> <FRAME SCROLLING="auto" NORESIZE NAME="header" SRC="/mw3300/mywebex/header.do?service=10&siteurl=campus&rnd=0.513406995989277" title="The header frame of Cisco WebEx Meetings"> <FRAME SCROLLING="auto" NORESIZE NAME="mainFrame" SRC="/mw3300/mywebex/loginframe.do?siteurl=campus&rnd=0.8746987491314661" target="_top" title="The content frame of Cisco WebEx Meetings"> <FRAME SCROLLING="no" NORESIZE NAME="rotation" SRC="/mw3300/mywebex/frame/clientpath.do?siteurl=campus" title="The clientPath frame of Cisco WebEx Meetings"> </FRAMESET> <noframes> <h2>Spiacenti.</h2> <p><b>Webex richiede l'uso di Netscape Navigator 4.0, Internet Explorer 4.0 o versioni successive.</b></p> </noframes> </html>
You could try using the By module: from selenium.webdriver.common.by import By from selenium import webdriver browser = webdriver.Firefox() browser.get("https://www.theurl.com") browser.find_element(By.NAME,"userName").send_keys("Test")
Thank you guys. Lukas's solution works fine and I can log in. But I don't understand why it works only for the link you used. In fact, if I use this other link, "https://campus.webex.com/mw3300/mywebex/default.do?siteurl=campus&service=7", which I need because it offers more functions, it doesn't work! The same code doesn't work! Why? I switched to Python 2.7 but there are some ASCII problems... Maybe this is the reason? This is my code: from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC browser = webdriver.Firefox() delay = 100 # seconds browser.get("https://campus.webex.com/mw3300/mywebex/default.do?siteurl=campus&service=7") myelem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, 'mwx-ipt-username'))) utente = browser.find_element_by_id("mwx-ipt-username").send_keys('User') psw = browser.find_element_by_id("mwx-ipt-password").send_keys('Psw') myelem.submit() But this code doesn't find the elements... Why? Bye
Extract URL from webpage and save to disk
I am trying to write a script to automatically query sci-hub.io with an article's title and save a PDF copy of the article's full text to my computer with a specific file name. To do this I have written the following code: url = "http://sci-hub.io/" data = read_csv("C:\\Users\\Sangeeta's\\Downloads\\distillersr_export (1).csv") for index, row in data.iterrows(): try: print('http://sci-hub.io/' + str(row['DOI'])) res = requests.get('http://sci-hub.io/' + str(row['DOI'])) print(res.content) except: print('NO DOI: ' + str(row['ref'])) This opens a CSV file with a list of DOIs and the names of the files to be saved. For each DOI, it then queries sci-hub.io for the full text. The presented page embeds the PDF in an iframe; however, I am now unsure how to extract the URL for the PDF and save it to disk. An example of the page can be seen in the image below: In this image, the desired URL is http://dacemirror.sci-hub.io/journal-article/3a257a9ec768d1c3d80c066186aba421/pajno2010.pdf. How can I automatically extract this URL and then save the PDF file to disk?
When I print res.content, I get this: b'<!DOCTYPE html>\n<html>\n <head>\n <title></title>\n <meta charset="UTF-8">\n <meta name="viewport" content="width=device-width">\n </head>\n <body>\n <style type = "text/css">\n body {background-color:#F0F0F0}\n div {overflow: hidden; position: absolute;}\n #top {top:0;left:0;width:100%;height:50px;font-size:14px} /* 40px */\n #content {top:50px;left:0;bottom:0;width:100%}\n p {margin:0;padding:10px}\n a {font-size:12px;font-family:sans-serif}\n a.target {font-weight:normal;color:green;margin-left:10px}\n a.reopen {font-weight:normal;color:blue;text-decoration:none;margin-left:10px}\n iframe {width:100%;height:100%}\n \n p.agitation {padding-top:5px;font-size:20px;text-align:center}\n p.agitation a {font-size:20px;text-decoration:none;color:green}\n\n .banner {position:absolute;z-index:9999;top:400px;left:0px;width:300px;height:225px;\n border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px}\n .banner img {border:0}\n \n p.donate {padding:0;margin:0;padding-top:5px;text-align:center;background:green;height:40px}\n p.donate a {color:white;font-weight:bold;text-decoration:none;font-size:20px}\n\n #save {position:absolute;z-index:9999;top:180px;left:8px;width:210px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:#F0F0F0;color:#333}\n\n #save a {text-decoration:none;color:white;font-size:inherit;color:#666}\n\n #save p { margin: 0; padding: 0; margin-top: 8px}\n\n #reload {position:absolute;z-index:9999;top:240px;left:8px;width:210px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:#F0F0F0;color:#333}\n\n #reload a {text-decoration:none;color:white;font-size:inherit;color:#666}\n\n #reload p { margin: 0; padding: 0; margin-top: 8px}\n\n\n #saveastro {position:absolute;z-index:9999;top:360px;left:8px;width:230px;height:70px;\n border-radius: 4px; border: solid 1px #ccc; 
background: white; text-align:center}\n #saveastro p { margin: 0; padding: 0; margin-top: 16px}\n \n \n #donate {position:absolute;z-index:9999;top:170px;right:16px;width:220px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:white;color:#333}\n \n #donate a {text-decoration:none;color:green;font-size:inherit}\n\n #donatein {position:absolute;z-index:9999;top:220px;right:16px;width:220px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:green;color:#333}\n\n #donatein a {text-decoration:none;color:white;font-size:inherit}\n \n #banner {position:absolute;z-index:9999;top:50%;left:45px;width:250px;height:250px; padding: 0; border: solid 1px white; border-radius: 4px}\n \n </style>\n \n \n \n <script type = "text/javascript">\n window.onload = function() {\n var url = document.getElementById(\'url\');\n if (url.innerHTML.length > 77)\n url.innerHTML = url.innerHTML.substring(0,77) + \'...\';\n };\n </script>\n <div id = "top">\n \n <p class="agitation" style = "padding-top:12px">\n \xd0\xa1\xd1\x82\xd1\x80\xd0\xb0\xd0\xbd\xd0\xb8\xd1\x87\xd0\xba\xd0\xb0 \xd0\xbf\xd1\x80\xd0\xbe\xd0\xb5\xd0\xba\xd1\x82\xd0\xb0 Sci-Hub \xd0\xb2 \xd1\x81\xd0\xbe\xd1\x86\xd0\xb8\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xbd\xd1\x8b\xd1\x85 \xd1\x81\xd0\xb5\xd1\x82\xd1\x8f\xd1\x85 \xe2\x86\x92 <a target="_blank" href="https://vk.com/sci_hub">vk.com/sci_hub</a>\n </p>\n \n </div>\n \n <div id = "content">\n <iframe src = "http://moscow.sci-hub.io/202d9ebdfbb8c0c56964a31b2fdfe8e9/roerdink2016.pdf" id = "pdf"></iframe>\n </div>\n \n <div id = "donate">\n <p><a target = "_blank" href = "//sci-hub.io/donate">\xd0\xbf\xd0\xbe\xd0\xb4\xd0\xb4\xd0\xb5\xd1\x80\xd0\xb6\xd0\xb0\xd1\x82\xd1\x8c \xd0\xbf\xd1\x80\xd0\xbe\xd0\xb5\xd0\xba\xd1\x82 →</a></p>\n </div>\n <div id = "donatein">\n <p><a target = "_blank" href = "//sci-hub.io/donate">support the project →</a></p>\n </div>\n 
<div id = "save">\n <p>\xe2\x87\xa3 \xd1\x81\xd0\xbe\xd1\x85\xd1\x80\xd0\xb0\xd0\xbd\xd0\xb8\xd1\x82\xd1\x8c \xd1\x81\xd1\x82\xd0\xb0\xd1\x82\xd1\x8c\xd1\x8e</p>\n </div>\n <div id = "reload">\n <p>↻ \xd1\x81\xd0\xba\xd0\xb0\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c \xd0\xb7\xd0\xb0\xd0\xbd\xd0\xbe\xd0\xb2\xd0\xbe</p>\n </div>\n \n \n<!-- Yandex.Metrika counter --> <script type="text/javascript"> (function (d, w, c) { (w[c] = w[c] || []).push(function() { try { w.yaCounter10183018 = new Ya.Metrika({ id:10183018, clickmap:true, trackLinks:true, accurateTrackBounce:true, ut:"noindex" }); } catch(e) { } }); var n = d.getElementsByTagName("script")[0], s = d.createElement("script"), f = function () { n.parentNode.insertBefore(s, n); }; s.type = "text/javascript"; s.async = true; s.src = "https://mc.yandex.ru/metrika/watch.js"; if (w.opera == "[object Opera]") { d.addEventListener("DOMContentLoaded", f, false); } else { f(); } })(document, window, "yandex_metrika_callbacks"); </script> <noscript><div><img src="https://mc.yandex.ru/watch/10183018?ut=noindex" style="position:absolute; left:-9999px;" alt="" /></div></noscript> <!-- /Yandex.Metrika counter -->\n </body>\n</html>\n' Which does include the URL, however I am unsure how to extract it. Update: I am now able to extract the URL but when I try to access the page with the PDF (through urllib.request) I get a 403 response even though the URL is valid. Any ideas on why and how to fix? (I am able to access through my browser so not IP blocked)
You can use urllib library to access the html of the page and even download files, and regex to find the url of the file you want to download. import urllib import re site = urllib.urlopen(".../index.html") data = site.read() # turns the contents of the site into a string files = re.findall('(http|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,#?^=%&:/~+#-]*[\w#?^=%&/~+#-])?(.pdf)', data) # finds the url for file in files: urllib.urlretrieve(file, filepath) # "filepath" is where you want to save it
Here is the solution: url = re.search('<iframe src = "\s*([^"]+)"', res.content) url.group(1) urllib.urlretrieve(url.group(1),'C:/.../Docs/test.pdf') I ran it and it works :) For Python 3: change urllib.urlretrieve to urllib.request.urlretrieve.
You can do it with a clunky code requiring selenium, requests and scrapy. Use selenium to request either an article title or DOI. >>> from selenium import webdriver >>> driver.get("http://sci-hub.io/") >>> input_box = driver.find_element_by_name('request') >>> input_box.send_keys('amazing scientific results\n') An article by the title 'amazing scientific results' doesn't seem to exist. As a result, the site returns a diagnostic page in the browser window which we can ignore. It also puts 'http://sci-hub.io/' in webdriver's current_url property. This is helpful because it's an indication that the requested result isn't available. >>> driver.current_url 'http://sci-hub.io/' Let's try again, looking for the item that you know exists. >>> driver.get("http://sci-hub.io/") >>> input_box = driver.find_element_by_name('request') >>> input_box.send_keys('DOI: 10.1016/j.anai.2016.01.022\n') >>> driver.current_url 'http://sci-hub.io/10.1016/j.anai.2016.01.022' This time the site returns a distinctive url. Unfortunately, if we load this using selenium we will get the pdf and, unless you're more able than I am, you will find it difficult to download this to a file on your machine. Instead, I download it using the requests library. Loaded in this form you will find that the url of the pdf becomes visible in the HTML. >>> import requests >>> r = requests.get(driver.current_url) To ferret out the url I use scrapy. >>> from scrapy.selector import Selector >>> selector = Selector(text=r.text) >>> pdf_url = selector.xpath('.//iframe/@src')[0].extract() Finally I use requests again to download the pdf so that I can save it to a conveniently named file on local storage. >>> r = requests.get(pdf_url).content >>> open('article_name', 'wb').write(r) 211853
I solved this using a combination of the answers above - namely SBO7 & Roxerg. I use the following to extract the URL from the page and then download the PDF: res = requests.get('http://sci-hub.io/' + str(row['DOI'])) useful = BeautifulSoup(res.content, "html5lib").find_all("iframe") urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(useful[0])) response = requests.get(urls[0]) with open("C:\\Users\\Sangeeta's\\Downloads\\ref\\" + str(row['ref']) + '.pdf', 'wb') as fw: fw.write(response.content) Note: This will not work for all articles - some link to webpages (example) and this doesn't correctly work for those.