Python - saving issue while scraping (Selenium, Chrome Driver)

I've recently started learning Python.
I have a file of 1000 words for which I want to do direct searches on a website, and extract the number of results for each word on the search results page.
I'm using Selenium and Chrome Driver, on Mac.
My script runs well: it inputs the keyword, submits the search, retrieves the output, and saves it correctly (cf. the screenshot of the output).
However, past a certain point, and I have no idea why, it starts saving the same output for a whole run of consecutive keywords.
Could that be due to the wifi or the Chrome driver? I have tried running the script while not using the computer, restarting my whole setup between runs, changing networks, and splitting the code into several batches over smaller lists of keywords. I have no idea why it stops behaving correctly past a certain point, and would really appreciate any ideas for solving this!
Script and imports below:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import time
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import selenium.webdriver.support.ui as ui
import selenium.webdriver.support.expected_conditions as EC
from tqdm.notebook import tqdm
import re

URL = "XXX"

# Add list of words, i.e. pull from csv.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
dir_path = '/Users/XXXXX'
chromedriver = dir_path + "/chromedriver"
header_list = ['Word']
words_df = pd.read_csv('/Users/XXXXX/results-file.csv', names=header_list)
words_list = words_df.Word.tolist()
os.environ["webdriver.chrome.driver"] = chromedriver

# Start the driver.
driver = webdriver.Chrome(options=options, executable_path=chromedriver)

# Hit the URL and wait for 2 seconds.
time.sleep(2)
driver.get(URL)

# Empty dictionary to store the data.
dictionary_numberresults = {}
for element in words_list[3000:10000]:
    # Enter the word in the search box, submit, and wait for 3 seconds.
    driver.find_element_by_xpath("//input[@id='twotabsearchtextbox']").clear()
    driver.find_element_by_xpath("//input[@id='twotabsearchtextbox']").send_keys(element)
    driver.find_element_by_xpath("//input[@id='twotabsearchtextbox']").submit()
    time.sleep(3)
    # Parse the page fetched from the driver with Beautiful Soup.
    HTMLPage = BeautifulSoup(driver.page_source, 'html.parser')
    Pagination = HTMLPage.find_all(class_="a-section a-spacing-small a-spacing-top-small")
    Number = re.findall('<span dir="auto">(.*)</span><span dir="auto">', str(Pagination), re.DOTALL)
    if not Number:
        Number = ['No results']
    dictionary_numberresults[element] = Number[0]
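One likely culprit, given the fixed time.sleep(3): if a results page is slow to load, driver.page_source may still hold the previous search's results, so the same count gets recorded for a run of consecutive keywords. A minimal sketch of an explicit wait that avoids this, assuming the results live in a container with class s-main-slot (an assumption; check the class name on the actual results page):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

old_results = driver.find_element_by_class_name("s-main-slot")  # assumed results container
searchbox = driver.find_element_by_xpath("//input[@id='twotabsearchtextbox']")
searchbox.clear()
searchbox.send_keys(element)
searchbox.submit()
# Wait up to 15 s until the old results element is detached from the DOM,
# i.e. the new results page has actually replaced it, before parsing.
WebDriverWait(driver, 15).until(EC.staleness_of(old_results))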

Related

I want to use Selenium to find specific text

I was going to use Selenium to crawl the web
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('./chromedriver', options=options)
driver.get('https://steamdb.info/tag/1742/?all')
driver.implicitly_wait(3)
li = []
games = driver.find_elements_by_xpath('//*[@class="table-products.text-center.dataTable"]')
for i in games:
    time.sleep(5)
    li.append(i.get_attribute("href"))
print(li)
After accessing the Steam URL I was looking for, I tried to find something called an appid.
The picture below shows the HTML I'm looking for.
I'm trying to find the number next to "data-appid=".
But if I run my code, nothing is saved in "games".
Correct me if I'm wrong, but from what I can see this Steam page requires you to log in; are you sure the same data is available to you when the webdriver opens the page?
Additionally, the XPath matches nothing because class is a space-separated attribute; the dotted form belongs to CSS selectors. When using By, the correct syntax would be games = driver.find_elements(By.CSS_SELECTOR, 'table.table-products.text-center.dataTable')
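To then pull the appid values themselves, a small sketch (the tr[data-appid] selector is an assumption based on the HTML described above, not verified against the live page): select the table rows that carry the attribute and read it with get_attribute.

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome('./chromedriver')
driver.get('https://steamdb.info/tag/1742/?all')
driver.implicitly_wait(3)
# Select only the rows of the products table that carry a data-appid attribute.
appids = [row.get_attribute('data-appid')
          for row in driver.find_elements(By.CSS_SELECTOR, 'table.table-products tr[data-appid]')]
print(appids)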

Grab CSV file from pop-up windows with Python

I am currently working on a project where I need to extract many files from a database, for which there is no API.
I need to do it through a webpage by constructing URLs similar to this one:
https://bmsnet.cas.dtu.dk/Trendlogs/ExportCSV_TrendlogRecordData/1
The integer at the end of the URL (1 in the example above) ranges from 1 to 35000. When constructing the URL, I get a pop-up window for saving the file, such as:
Pop-up window for file download
My question is how to automate that process using Python. I am capable of generating these URLs and handling the data resulting from the file download (so far by doing this manually). The step I am stuck on is constructing a Python command/bit of code that allows me to click the Save As button. Eventually I want to end up with code doing the following:
Construct the URL
Save the file arising from the pop-up window
Load/read and process the data
EDIT:
I have now found a solution using Selenium.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pyautogui
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

dl_path = "MY_LOCAL_DOWNLOAD_PATH"
profile = FirefoxProfile()
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference("browser.download.dir", dl_path)
profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                       "text/plain,text/x-csv,text/csv,application/vnd.ms-excel,application/csv,application/x-csv,text/comma-separated-values,text/x-comma-separated-values,text/tab-separated-values,application/pdf")

driver = webdriver.Firefox(firefox_profile=profile)
URL = "https://bmsnet.cas.dtu.dk"
driver.get(URL)
# Let the page load.
time.sleep(5)

username = driver.find_element_by_id("Email")
password = driver.find_element_by_id("Password")
username.send_keys("my_username")
password.send_keys("my_password")
elem = driver.find_element_by_xpath("/html/body/div[2]/div/div[1]/section/form/div[4]/div/input")
elem.click()
time.sleep(5)

start = 1
stop = 10
for file_integer in range(start, stop):
    URL = "https://bmsnet.cas.dtu.dk/Trendlogs/ExportCSV_TrendlogRecordData/{0}".format(file_integer)
    driver.get(URL)
    time.sleep(5)
    print('Done downloading integer: {0}'.format(file_integer))
The above code works, but only once; for some reason the for loop gets stuck after the first iteration. Any clue on what I am doing wrong there?
Thank you for your time and help. Looking forward to hearing your ideas on that.
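One way to sidestep the download dialog entirely, sketched under the assumption that these URLs return the CSV directly once authenticated: copy Selenium's session cookies into a requests.Session after the login above and fetch the files without the browser. This is a different technique from the browser-download approach, not a fix of it.

import requests

# Reuse the cookies from the logged-in Selenium session.
session = requests.Session()
for cookie in driver.get_cookies():
    session.cookies.set(cookie['name'], cookie['value'])

for file_integer in range(1, 10):
    url = "https://bmsnet.cas.dtu.dk/Trendlogs/ExportCSV_TrendlogRecordData/{0}".format(file_integer)
    resp = session.get(url)
    if resp.ok:
        with open("trendlog_{0}.csv".format(file_integer), "wb") as f:
            f.write(resp.content)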

beautifulsoup scrape realtime values

I am trying to scrape currency rates for a personal project. I used a CSS selector to get the class where the values are. A JavaScript on the website provides those values, and as I am not too conversant with the developer console, I checked it out but could not see anything running in real time in the Network section. This is the code I wrote; so far it brings out a long list of dashes. Surprisingly, the dashes match the source code for those parts where the rates are supposed to show.
from bs4 import BeautifulSoup
import requests

r = requests.get("https://www.ig.com/en/forex/markets-forex")
soup = BeautifulSoup(r.content, "html.parser")
results = soup.findAll("span", attrs={"data-field": "CPT"})
for span in results:
    print(span.text)
The span elements are filled via JS with dynamic values; on page load each span contains '-'.
You need a JS-capable driver that waits for the elements to fill, and can then read the values from the spans.
With Selenium:
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome('./chromedriver')
driver.get('https://www.ig.com/en/forex/markets-forex')
for elm in driver.find_elements(By.CSS_SELECTOR, "span[data-field=CPT]"):
    print(elm, elm.text)
chromedriver can be downloaded from https://sites.google.com/a/chromium.org/chromedriver/home
Also possible: dryscrape + bs4, but dryscrape seems outdated.
Modified:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome('./chromedriver')
driver.get('https://www.ig.com/en/forex/markets-forex')
time.sleep(2)  # maybe more or less, depending on how fast the page loads
for elm in driver.find_elements(By.CSS_SELECTOR, "span[data-field=CPT]"):
    if elm.text:
        print(elm, elm.text)
or
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome('./chromedriver')
driver.get('https://www.ig.com/en/forex/markets-forex')
data = []
while not data:
    for elm in driver.find_elements(By.CSS_SELECTOR, "span[data-field=CPT]"):
        if elm.text and elm.text != '-':  # maybe also check that it contains a digit
            data.append(elm.text)
    time.sleep(1)
print(data)
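The same idea reads more directly with Selenium's explicit waits instead of a hand-rolled polling loop. A sketch using the same selector as above; WebDriverWait accepts any callable that takes the driver and returns a truthy value:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome('./chromedriver')
driver.get('https://www.ig.com/en/forex/markets-forex')

def rates_loaded(drv):
    # True once every CPT span shows something other than the '-' placeholder.
    spans = drv.find_elements(By.CSS_SELECTOR, "span[data-field=CPT]")
    return bool(spans) and all(s.text and s.text != '-' for s in spans)

WebDriverWait(driver, 15).until(rates_loaded)
data = [s.text for s in driver.find_elements(By.CSS_SELECTOR, "span[data-field=CPT]")]
print(data)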

difference between chromedriver and phantomjs with python

I'm working on a web crawler in Python using Selenium.
I successfully got content using chromedriver, but a problem occurred when I tried to crawl headlessly through PhantomJS: find_element_by_id and find_element_by_name did not work.
Is there any difference between these two? I am trying to make this headless because I want to run the code on an Ubuntu server as a batch job, without GUI support.
My script is as below.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
#driver = webdriver.PhantomJS('/Users/user/Downloads/phantomjs-2.1.1-macosx/bin/phantomjs')
#driver = webdriver.Chrome('/Users/user/Downloads/chromedriver')
driver = webdriver.PhantomJS()
driver.set_window_size(1120, 550)
driver.get(url)
driver.implicitly_wait(3)
# here I tried two different find methods, but neither worked
user = driver.find_element(by=By.NAME,value="user:email")
password = driver.find_element_by_id('user_password')
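Not the asker's setup, but the usual way out, sketched here: PhantomJS support in Selenium was deprecated, and headless Chrome covers the same no-GUI, batch-job-on-a-server case, so the chromedriver code that already works can simply be run headless:

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--window-size=1120,550')
driver = webdriver.Chrome('/Users/user/Downloads/chromedriver', options=options)
driver.get(url)  # url as in the asker's script
driver.implicitly_wait(3)
user = driver.find_element(by=By.NAME, value="user:email")
password = driver.find_element_by_id('user_password')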

How to 'login' to a secure website using Python, for a webpage that uses JavaScript and does not have any real source when viewing the page source

I would like to write a script to log into my bank and download the latest transactions, but I am stumped at the login page: it seems to be dynamically created each time, and there is very little helpful information in the page source. I am new to using Python for this task and would appreciate any direction. I get that most website logins use some form of POST to send the username and password, but my bank has done something crazy to be secure, which I guess is fine but is not helping my cause.
This is the login page I would like to log into using Python:
https://www.txn.banking.pcfinancial.ca/ebm-resources/public/client/web/index.html#/signon
With a bit more web scraping experience under my belt, I have revisited this challenge and come up with a solution using the Selenium web driver with Firefox. It works quite well, although there is definitely room for improvement, like replacing the hard waits with something more elegant that gets Selenium to wait for the page to reload. Still, it should give a good idea to anyone else interested in automating the download of their banking information from PC Financial (which is Simplii now). I do this because I import the information into Beancount/fava to monitor my finances closely.
Here is my current working script. It downloads all transactions since the last download for selected accounts, matching on the last 4 digits, and renames each downloaded file using the last 4 digits of the account and the current date.
# import libraries
import re
from contextlib import closing
from selenium.webdriver import Firefox  # pip install selenium
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
import time
import os
from datetime import datetime

current_date = datetime.today().strftime('%Y-%m-%d')

# specify the url
urlpage = 'https://online.simplii.com/ebm-resources/public/client/web/index.html#/signon'
print(urlpage)
dl_dir = "/home/user/bank_statements"
profile = FirefoxProfile()
profile.set_preference("browser.download.panel.shown", False)
profile.set_preference("browser.helperApps.neverAsk.openFile", "text/csv")
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.dir", dl_dir)

# list of last 4 digits of account numbers to download statements for
account_filter = ['1234', '4321', '9999']

with closing(Firefox(firefox_profile=profile)) as driver:
    driver.get(urlpage)
    time.sleep(7)
    # enter card number
    driver.find_element_by_xpath("//input[@name='cardNumber']").send_keys('123456789')
    # enter online banking password
    driver.find_element_by_xpath("//input[@type='password']").send_keys('yourpassword')
    driver.find_element_by_xpath("//div[text()='Sign In']").click()
    time.sleep(5)
    driver.find_element_by_link_text('Download Transactions').click()
    time.sleep(2)
    act_section = driver.find_element_by_class_name('account-section')
    act_select = Select(act_section.find_element_by_tag_name('select'))
    options = act_select.options
    for act in range(1, len(options) - 1):
        # extract last 4 digits of the account number
        last4_search = re.search(r'.+\(\d{6}(\d{4})\)\s+\$', options[act].text)
        if last4_search:
            last4 = last4_search.group(1)
            if last4 in account_filter:
                print('downloading transactions for: ...' + last4)
                act_select.select_by_index(act)
                last_dl = driver.find_element_by_xpath("//input[@type='radio'][@value='LAST_DOWNLOAD']/ancestor::ui-radiobutton")
                result = last_dl.find_element_by_xpath("//input[@type='radio'][@value='LAST_DOWNLOAD']").is_selected()
                if not result:
                    last_dl.click()
                format_section = driver.find_element_by_class_name('format-section')
                format_select = Select(format_section.find_element_by_tag_name('select'))
                format_select.select_by_index(3)
                # initiate download
                driver.find_element_by_xpath("//div[text()='Download Transactions']").click()
                time.sleep(10)
                # rename file: e.g. SIMPLII.csv -> SIMPLII_1234_2021-01-31.csv
                if os.path.isfile(dl_dir + '/SIMPLII.csv'):
                    os.rename(dl_dir + '/SIMPLII.csv',
                              dl_dir + '/' + '_'.join(['SIMPLII', last4, current_date]) + '.csv')
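On the point above about replacing the hard waits: a sketch of the explicit-wait equivalent for the sign-in steps, using the WebDriverWait/EC/By imports the script already has (the locators are the same ones used above):

wait = WebDriverWait(driver, 20)
# Wait for the card-number field instead of sleeping a fixed 7 seconds.
card = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='cardNumber']")))
card.send_keys('123456789')
driver.find_element_by_xpath("//input[@type='password']").send_keys('yourpassword')
driver.find_element_by_xpath("//div[text()='Sign In']").click()
# Proceed as soon as the post-login page exposes the link, instead of sleeping 5 seconds.
wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Download Transactions'))).click()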
