Read csv file from url - python

I want to read URLs from a csv file, but I get an error saying the URL must be a string. Is there any possible solution for that? Kindly suggest how to make the URL a string.
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer
with open("data.csv") as file:
    start_urls = [line.strip() for line in file]
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(start_urls)

As commented by @JeJe, you need to iterate through your start_urls list.
start_urls is a LIST of strings, and driver.get expects just one url as a string - something like
# driver.get(start_urls) # NOT like this
for sUrl in start_urls:
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
or, if you want to keep track of progress, something like
suLen = len(start_urls)
for si, sUrl in enumerate(start_urls):
    print(f'[{si} of {suLen}] reading {sUrl}')
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
Btw, you don't need to do everything under with open... - getting start_urls is enough:
with open("data.csv") as file:
    start_urls = [line.strip() for line in file]

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

suLen = len(start_urls)
for si, sUrl in enumerate(start_urls):
    print(f'[{si} of {suLen}] reading {sUrl}')
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
but you can only use start_urls = [line.strip() for line in file] like this if your csv has only one column and no header.
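If your csv does have a header row or more than one column, a minimal sketch using pandas (which your code already imports) would be the following - the column name "url" is an assumption, replace it with whatever your file actually uses:

import pandas as pd

# read the csv and keep only the column that holds the links;
# "url" is a placeholder column name - adjust it to your file
df = pd.read_csv("data.csv")
start_urls = df["url"].dropna().astype(str).tolist()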

You could also rewrite your code using functions to deal with each step:
def read_file(has_header: bool = True):
    with open("data.csv") as file:
        start_urls = [line.strip() for line in file]
    # skip the header row if there is one
    if has_header:
        return start_urls[1:]
    else:
        return start_urls

def scrap_urls():
    urls = read_file(has_header=True)
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--disable-extensions")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    for url in urls:
        driver.get(url)

if __name__ == '__main__':
    scrap_urls()


Write data in csv file but data are overwritten

I am trying to scrape the data, but the data get overwritten and the csv file ends up with the data of only one of the 2 pages. I think the for loop overwrites the data. How can I fix this? Kindly recommend a solution. This is the page link: https://www.askgamblers.com/online-casinos/countries/ca/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
for page in range(1, 3):
    URL = 'https://www.askgamblers.com/online-casinos/countries/ca/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)
    urls = []
    data = []
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    for link in page_links:
        href = link.get_attribute("href")
        urls.append(href)
    with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
        thewriter = writer(csvfile)
        header = ['name', 'url', 'website_link', 'company', 'rating']
        thewriter.writerow(header)
        for url in urls:
            driver.get(url)
            time.sleep(1)
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except:
                pass
            try:
                company = driver.find_element(By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except:
                pass
            try:
                link = driver.find_element(By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except:
                pass
            jobinfo = [name, url, link, company, rate]
            thewriter.writerow(jobinfo)
You open the same file for (over)writing with 'w' on every pass, but you loop over two pages, so each pass wipes out the previous one. Use a different file name per page, or open with 'a' (append) instead - although with the current configuration you would then also get the header written once per page.
Better would be to open the file for writing outside the for page loop, write the header once, and then write the rows inside for page.
Basically:
with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    header = ['name', 'url', 'website_link', 'company', 'rating']
    thewriter.writerow(header)
    for page in range(1, 3):
        ...  # compute the row info
        jobinfo = [name, url, link, company, rate]
        thewriter.writerow(jobinfo)
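If you would rather keep the file handling inside the page loop, a sketch of the append-mode variant is below; the os.path.exists check for writing the header only once is my own addition, not part of the original answer:

import os
from csv import writer

def append_row(row, path='product.csv'):
    # write the header only when the file is created for the first time
    write_header = not os.path.exists(path)
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        thewriter = writer(csvfile)
        if write_header:
            thewriter.writerow(['name', 'url', 'website_link', 'company', 'rating'])
        thewriter.writerow(row)

# inside the per-url loop of the question code:
# append_row([name, url, link, company, rate])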

locating element via selenium explicitly waiting and without gives 0

Using this url I want to locate div tags which have the attribute data-asin. When I use //div[@data-asin] in Chrome Inspect mode it gives 21 elements, but trying to get these elements via Selenium in both ways - with an explicit wait and by taking the length directly - gives 0. I guess the Selenium remote browser is unable to get any of these elements from the DOM tree. The code is below:
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#reading from csv file url-s
def readCSV(path_csv):
    df = pd.read_csv(path_csv)
    return df

fileCSV = readCSV(r'C:\Users\Admin\Downloads\urls.csv')
length_of_column_urls = fileCSV['linkamazon'].last_valid_index()

def create_driver():
    chrome_options = Options()
    chrome_options.headless = True
    chrome_options.add_argument("start-maximized")
    # options.add_experimental_option("detach", True)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    webdriver_service = Service(r'C:\Users\Admin\Downloads\chromedriver107v\chromedriver.exe')
    driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    return driver

# going to urls 1-by-1
def goToUrl_Se(driver):
    global counter
    counter = 0
    for i in range(0, length_of_column_urls + 1):
        xUrl = fileCSV.iloc[i, 1]
        print(xUrl, i)
        # going to url (amazon) via Selenium WebDriver
        driver.get(xUrl)
        parse_data()
        counter += 1
    driver.quit()

# fetch-parse the data from the url page
def parse_data():
    global asin, title, bookform, priceNewProd, author
    wait = WebDriverWait(driver, timeout=77)
    try:
        x_index = wait.until(EC.visibility_of_all_elements_located((By.TAG_NAME, '//div[@data-asin]')))  ### Attention here
        print(len(x_index))
    except:
        y_index = driver.find_elements(By.TAG_NAME, '//div[@data-asin]')  ### And attention here
        print(len(y_index))

driver = create_driver()
goToUrl_Se(driver)
You have to use By.XPATH, not By.TAG_NAME:
try:
    x_index = wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@data-asin]')))  ### Attention here
    print(len(x_index))
except:
    y_index = driver.find_elements(By.XPATH, '//div[@data-asin]')  ### And attention here
    print(len(y_index))
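For comparison, By.TAG_NAME only takes a bare tag name and cannot express an attribute filter, so the equivalent lookups (using the same driver) would look like this:

# By.TAG_NAME matches every <div> on the page - no attribute filtering possible
all_divs = driver.find_elements(By.TAG_NAME, 'div')

# attribute filtering needs XPath or a CSS selector
divs_with_asin = driver.find_elements(By.CSS_SELECTOR, 'div[data-asin]')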

Select first element of srcset with python selenium

Using selenium in Python, I have been able to successfully access some URLs of an image I want to download. However, the image link is stored within a srcset image attribute. When I use get_attribute('srcset'), it returns a string with the 4 links, and I just want one. How would I go about doing this? Could I possibly just crop the string afterwards?
Here's the site that I am scraping from:
https://www.politicsanddesign.com/
Here is my code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import pyautogui
import time
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), options = chrome_options)
driver.get('https://www.politicsanddesign.com/')
img_url = driver.find_element(By.XPATH, "//div[@class = 'responsive-image-wrapper']/img").get_attribute("srcset")
driver.get(img_url)
And here is what the img_url object looks like:
//images.ctfassets.net/00vgtve3ank7/6f38yjnNcU1d6dw0jt1Uhk/70dfbf208b22f7b1c08b7421f910bb36/2020_HOUSE_VA-04_D-MCEACHIN..jpg?w=400&fm=jpg&q=80 400w, //images.ctfassets.net/00vgtve3ank7/6f38yjnNcU1d6dw0jt1Uhk/70dfbf208b22f7b1c08b7421f910bb36/2020_HOUSE_VA-04_D-MCEACHIN..jpg?w=800&fm=jpg&q=80 800w, //images.ctfassets.net/00vgtve3ank7/6f38yjnNcU1d6dw0jt1Uhk/70dfbf208b22f7b1c08b7421f910bb36/2020_HOUSE_VA-04_D-MCEACHIN..jpg?w=1200&fm=jpg&q=80 1200w, //images.ctfassets.net/00vgtve3ank7/6f38yjnNcU1d6dw0jt1Uhk/70dfbf208b22f7b1c08b7421f910bb36/2020_HOUSE_VA-04_D-MCEACHIN..jpg?w=1800&fm=jpg&q=80 1800w
But I'd like it to just be:
//images.ctfassets.net/00vgtve3ank7/6f38yjnNcU1d6dw0jt1Uhk/70dfbf208b22f7b1c08b7421f910bb36/2020_HOUSE_VA-04_D-MCEACHIN..jpg?w=400&fm=jpg&q=80
The image seems to have an attribute called currentSrc which holds only the currently selected value.
img_url = driver.find_element(By.XPATH, "//div[@class = 'responsive-image-wrapper']/img").get_attribute("currentSrc")
driver.get(img_url)
You can simply split the value extracted from that web element, as follows:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import pyautogui
import time
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), options = chrome_options)
driver.get('https://www.politicsanddesign.com/')
img_url = driver.find_element(By.XPATH, "//div[@class = 'responsive-image-wrapper']/img").get_attribute("srcset")
img_urls = img_url.split(",")
Now img_urls is a list with one entry per width in the srcset (4 here), so you can use it as follows:
driver.get(img_urls[0]) #open the first URL
driver.get(img_urls[1]) #open the second URL
driver.get(img_urls[2]) #open the third URL
My inefficient solution:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import pyautogui
import time
# WILL NEED TO EVENTUALLY FIGURE OUT HOW TO WRAP ALL OF THIS INTO A FUNCTION OR LOOP TO DO IT FOR ALL DIV OBJECTS
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), options = chrome_options)
driver.get('https://www.politicsanddesign.com/')
img_url = driver.find_element(By.XPATH, "//div[@class = 'responsive-image-wrapper']/img").get_attribute("srcset")
driver.get(img_url)
img_url2 = 'https:' + img_url.split(' 400w',1)[0]
driver.get(img_url2)
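A tidier alternative (my own sketch, not part of either answer) is to parse each srcset entry into a URL and its width descriptor and then pick whichever variant you want, instead of splitting on the hard-coded ' 400w' string; driver and By are the same objects as in the code above:

# each srcset entry looks like "<url> <width>w"
srcset = driver.find_element(
    By.XPATH, "//div[@class = 'responsive-image-wrapper']/img"
).get_attribute("srcset")

candidates = []
for entry in srcset.split(","):
    url, width = entry.strip().rsplit(" ", 1)
    candidates.append((url, int(width.rstrip("w"))))

# pick the smallest variant; the URLs are protocol-relative, so prepend https:
smallest_url = "https:" + min(candidates, key=lambda c: c[1])[0]
driver.get(smallest_url)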

How would I loop through elements Python Selenium

I have a website for which I want to download excel files. (https://www.rivm.nl/media/smap/eenzaamheid.html)
First I want to click on the region and then perform the download. This I have working.
wijk_keuze = WebDriverWait(driver2nd,20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='highcharts-container ']//*[name()='svg']//*[name()='g']//*[name()='path']")))
wijk_keuze.click()
download = WebDriverWait(driver2nd, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='highcharts-container ']//*[name()='svg']//*[name()='g' and @aria-label='View export menu']//*[name()='rect']")))
download.click()
WebDriverWait(download, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='highcharts-menu']//*[contains(text(),'XLS downloaden')]"))).click()
time.sleep(2)
The above code selects the first region in the parent element and then downloads the excel.
What I want to do is loop through each element in the parent element. How would I go about doing so?
The parent element looks as follows:
<g transform="transform(0,0), scale(1,1)" animator="1">
my entire code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
#defining URL
url='https://www.rivm.nl/media/smap/eenzaamheid.html'
#defining driver
driver = webdriver.PhantomJS(r'./phantomjs-2.1.1-windows/bin/phantomjs')
options = webdriver.ChromeOptions()
options.add_argument("start-maximized");
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=options, executable_path = r'./chromedriver_win32/chromedriver')
driver.get(url)
# Gemeentes
Detail_keuze = Select(driver.find_element_by_id("detail"))
options = Detail_keuze.options
Indicator_keuze = Select(driver.find_element_by_id("indicator"))
indicator_options = Indicator_keuze.options
for index in range(0, len(indicator_options)):
    # defining URL
    url = 'https://www.rivm.nl/media/smap/eenzaamheid.html'
    # defining driver
    driver2nd = webdriver.PhantomJS(r'./phantomjs-2.1.1-windows/bin/phantomjs')
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    options.add_experimental_option("prefs", {
        "download.default_directory": r"MY_PATH",
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True
    })
    driver2nd = webdriver.Chrome(options=options, executable_path=r'./chromedriver_win32/chromedriver')
    driver2nd.get(url)
    # Gemeentes
    Detail_keuze = Select(driver2nd.find_element_by_id("detail"))
    options = Detail_keuze.options
    Indicator_keuze = Select(driver2nd.find_element_by_id("indicator"))
    indicator_options = Indicator_keuze.options
    time.sleep(1)
    Indicator_keuze.select_by_index(index)
    wijk_keuze = WebDriverWait(driver2nd, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='highcharts-container ']//*[name()='svg']//*[name()='g']//*[name()='path']")))
    wijk_keuze.click()
    download = WebDriverWait(driver2nd, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='highcharts-container ']//*[name()='svg']//*[name()='g' and @aria-label='View export menu']//*[name()='rect']")))
    download.click()
    WebDriverWait(download, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='highcharts-menu']//*[contains(text(),'XLS downloaden')]"))).click()
    time.sleep(2)
    ######## HERE I WANT TO LOOP THROUGH EACH AND EVERY REGION
    driver2nd.close()
As you can see, I also want to loop through each and every indicator - this works. Now I want to add a loop through each and every region. I have it working so that I can click on the first region.
You don't have to click on the options. You can get the details by changing the url to 'https://www.rivm.nl/media/smap/{indicator}?detail={detail}'.
Just add the logic for downloading it.
Try:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from itertools import product
import time
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install())#, chrome_options=chrome_options)
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get('https://www.rivm.nl/media/smap/eenzaamheid.html')
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
ind = soup.find('select', attrs = {'name': 'indicator'} )
indicators = [i['value'] for i in ind.findAll('option')]
det = soup.find('select', attrs = {'name': 'detail'})
details = [i['value'] for i in det.findAll('option')]
for detail, indicator in list(product(details, indicators)):
    print(indicator, detail)
    new_url = f'https://www.rivm.nl/media/smap/{indicator}?detail={detail}'
    driver.get(new_url)
    # Write code for downloading it
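For the download step itself, one option (a sketch of mine that reuses the export-menu XPaths from the question; the selectors are assumed to still match the page) is to keep the question's click sequence inside the loop, in place of the final comment:

# inside the for loop, after driver.get(new_url)
wait = WebDriverWait(driver, 20)
menu = wait.until(EC.element_to_be_clickable((By.XPATH,
    "//div[@class='highcharts-container ']//*[name()='svg']"
    "//*[name()='g' and @aria-label='View export menu']//*[name()='rect']")))
menu.click()
wait.until(EC.visibility_of_element_located((By.XPATH,
    "//div[@class='highcharts-menu']//*[contains(text(),'XLS downloaden')]"))).click()
time.sleep(2)  # give the browser a moment to finish the download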

How to scrape and extract links to n level and scrape the data again and map it to output in python?

I am learning web crawling and scraping in Python. I want to scrape data from a site where there are links, and inside those links there are more links, so I want to scrape data down to a predefined level n.
This is my basic code
import requests
from selenium import webdriver
from requests_ntlm import HttpNtlmAuth
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from webdrivermanager import GeckoDriverManager
import pickle
from selenium.webdriver.common.keys import Keys
from urllib.parse import urljoin
from seleniumrequests import Chrome
options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
driver = webdriver.Chrome("./chromedriver",options=options)
web_url = 'https://spaceflightnow.com/'
driver.get("https://spaceflightnow.com/")
time.sleep(5)
soup = BeautifulSoup(driver.page_source,"lxml")
#section = soup.section
links=[]
for url in soup.find_all('a', href=True):
    links.append(urljoin(web_url, url.get('href')))
    #print(urljoin(web_url,url.get('href')))
links = list(filter(lambda x: x != web_url,links))
print(links)
This prints multiple links from the first page. Now I want to follow all of those links, scrape each of them, and collect more links inside, down to level n. The same links may appear again internally, for example from the news feed. What should my approach be? I understand I need something like a tree, but I cannot figure out exactly how - do I create a list inside a list, and how do I do that dynamically to n levels? And how do I map it to the data saved in a file? Can anyone help me with this, maybe with a sample solution?
Thank you :)
I made an example which works without recursion - I would say it is similar to the Breadth-First Search algorithm.
It keeps urls on a list [(url, level), ...] to control the level, and in a set() to filter already-visited pages. It also filters out links to external pages.
Tested with Firefox.
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# ---

def get_links(driver, url):
    driver.get(url)
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "lxml")
    links = []
    for new_url in soup.find_all('a', href=True):
        new_url = new_url.get('href')
        new_url = urljoin(url, new_url)
        links.append(new_url)
    return links

# ---

options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
driver = webdriver.Chrome("./chromedriver", options=options)
#driver = webdriver.Firefox()

# ---

domain = 'https://spaceflightnow.com/'  # to filter external links
start_url = 'https://spaceflightnow.com/'
max_level = 2

links_visited = set([start_url])      # to test visited links
links_with_levels = [(start_url, 0)]  # to control levels

# ---

for link, level in links_with_levels:
    if level >= max_level:
        print('skip:', level, link)
        continue
    print('visit:', level, link)
    links = get_links(driver, link)
    print('found:', len(links))
    links = list(set(links) - links_visited)
    print('after filtering:', len(links))
    level += 1
    for new_link in links:
        if new_link.startswith(domain):  # filter external links
            links_visited.add(new_link)
            links_with_levels.append((new_link, level))

# ---

for link, level in links_with_levels:
    print('skip:', level, link)
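To map the crawl back to a file (the part of the question the example above does not cover), a minimal sketch - assuming csv output is acceptable - is to dump links_with_levels at the end:

import csv

# write every (url, level) pair collected above to a csv file
with open('crawled_links.csv', 'w', newline='', encoding='utf-8') as f:
    out = csv.writer(f)
    out.writerow(['level', 'url'])
    for link, level in links_with_levels:
        out.writerow([level, link])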
