I'm trying to save the data from a Google Scholar profile into a CSV. The profile has a 'Show More' button, and I can get all the data from it (here I only saved the data from the table, but I need everything from the profile). The problem is that I sometimes save the data twice or more, and I think it's because I save it while I'm still clicking instead of after I have clicked all the 'Show More' buttons. How can I do that? Also, here I used only one URL, but there are more, saved in another CSV. How do I open each URL from there and do the same thing? (I only need the Link column.) The CSV with the URLs looks like this:
https://drive.google.com/file/d/1zkTlzYaOQ7FVoSdd5OMnE8QgwS8NOik7/view?usp=sharing
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common import exceptions as SE
from selenium import webdriver
import time
from csv import writer

chrome_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)

urls = ["https://scholar.google.com/citations?hl=en&user=gQb_tFMAAAAJ"]
button_locators = "//button[@class='gs_btnPD gs_in_ib gs_btn_flat gs_btn_lrge gs_btn_lsu']"
wait_time = 2
wait = W(driver, wait_time)

for url in urls:
    data = {}
    driver.get(url)
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
    while button_link:
        try:
            wait.until(EC.visibility_of_element_located((By.ID, 'gsc_a_tw')))
            data = driver.find_elements_by_class_name("gsc_a_tr")
            button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
            button_link.click()
            time.sleep(2)
            with open('perfil.csv', 'a', encoding="utf-8", newline='') as s:
                csv_writer = writer(s)
                for i in range(len(data)):
                    paper = driver.find_elements_by_class_name("gsc_a_t")
                    citas = driver.find_elements_by_class_name("gsc_a_c")
                    año = driver.find_elements_by_class_name("gsc_a_y")
                    p = paper[i].text.replace(',', '')
                    c = citas[i].text.replace(',', '')
                    a = año[i].text.replace(',', '')
                    csv_writer.writerow([p, c, a])
        except SE.TimeoutException:
            print(f'Page parsed {url}')
            break

driver.quit()
For the first part I didn't really get what's happening, but for the second part: you can move the loop into a function instead of hard-coding the URLs, and you can use the pandas library for the CSV (it's much more convenient). This is for getting the URLs:
import pandas as pd
df = pd.read_csv(csv_file)
urls = df['column_name']
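Putting the two parts together, a rough sketch (untested against the live page; it reuses wait, button_locators, EC, By and SE from the question's code, and assumes the URL file is called urls.csv with a column named Link):
import pandas as pd

urls = pd.read_csv('urls.csv')['Link']  # file name and column name assumed

for url in urls:
    driver.get(url)
    # keep clicking 'Show More' until the wait times out, i.e. no button is left
    while True:
        try:
            button = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
            button.click()
            time.sleep(2)
        except SE.TimeoutException:
            break
    # only now, with every row loaded, read the table once
    rows = driver.find_elements_by_class_name('gsc_a_tr')
This avoids the duplicates because the rows are read exactly once per profile, after the last click.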
Here is the most basic way to read data from a CSV file:
import csv

with open('filename.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        print(row)
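If you only need the Link column, csv.DictReader from the same module can pick it out by header name (a sketch; it assumes the header in the file is literally Link):
import csv

with open('urls.csv', 'r', encoding='utf-8', newline='') as file:
    urls = [row['Link'] for row in csv.DictReader(file)]
print(urls)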
Now I am having an issue where the next page does not load completely before the script takes the information down, so it fails with a NoSuchElementException halfway through. I tried adding a WebDriverWait as shown below in the code, but it is not working. Any help is greatly appreciated. Thank you!
import sys
import csv
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# default path to file to store data
path_to_file = "/Users/Dan/Desktop/reviews.csv"
# default number of scraped pages
num_page = 1939
# default tripadvisor website of hotel or things to do (attraction/monument)
url = "https://www.tripadvisor.com/Attraction_Review-g187791-d192285-Reviews-Colosseum-Rome_Lazio.html"

# if you pass the inputs in the command line
if len(sys.argv) == 4:
    path_to_file = sys.argv[1]
    num_page = int(sys.argv[2])
    url = sys.argv[3]

# import the webdriver
driver = webdriver.Safari()
driver.get(url)

# open the file to save the review
csvFile = open(path_to_file, 'a', encoding="utf-8")
csvWriter = csv.writer(csvFile)

# change the value inside the range to save more or less reviews
data = []
for i in range(0, num_page):
    # expand the review
    wait = WebDriverWait(driver, 10)
    element = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='tabs-1']/div[2]/a[@accesskey='n']")))
    container = driver.find_elements_by_xpath("//*[@id='tabs-1']/div[3]/table/tbody")
    for con in container:
        name = con.find_element_by_xpath(".//tr[2]/td").text.replace("Sponsor Name:", "")
        start = con.find_element_by_xpath(".//tr[1]/td[3]").text.replace("Start Date*: ", "")
        data.append((name, start))
    df = pd.DataFrame(data, columns=['Name', 'Start'])
    df.to_csv('/Users/Dan/Desktop/reviews.csv', index=False)
    driver.find_element_by_xpath("//*[@id='tabs-1']/div[2]/a[@accesskey='n']").click()
driver.quit()
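Not from the original thread, but one common fix for "the next page has not finished loading" is to keep a handle on an element from the old page, click next, and wait for that handle to go stale before scraping again. A sketch using the question's own locators:
old_table = driver.find_element_by_xpath("//*[@id='tabs-1']/div[3]/table/tbody")
driver.find_element_by_xpath("//*[@id='tabs-1']/div[2]/a[@accesskey='n']").click()
# blocks until the old table is detached from the DOM, i.e. the new page has replaced it
WebDriverWait(driver, 10).until(EC.staleness_of(old_table))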
I am not sure how to do it with csv writer, but I know that you can do it like this: append the rows to a list and then write it out to the CSV file with pandas.
driver.get("https://www.clinicaltrialsregister.eu/ctr-search/search?query=")
container = driver.find_elements_by_xpath("//*[#id='tabs-1']/div[3]/table/tbody")
data=[]
for con in container:
name = con.find_element_by_xpath(".//tr[2]/td").text
start = con.find_element_by_xpath(".//tr[1]/td[3]").text
data.append((name,start))
df = pd.DataFrame(data,columns=['Name','Start'])
#print(df)
df.to_csv('/Users/Dan/Desktop/reviews.csv', index = False)
Import
import pandas as pd
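And for the csv writer part this answer was unsure about, the same list of tuples can be written directly with the standard library (a sketch, same data list and output path as above):
import csv

with open('/Users/Dan/Desktop/reviews.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Name', 'Start'])  # header row
    writer.writerows(data)              # one row per (name, start) tuple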
I am trying to scrape a website and output the info to a CSV file. The data I am trying to extract prints to the terminal, but I need it in a CSV file.
I have tried several different methods but cannot find a solution.
The CSV file is created, but it's just empty. It's probably something really simple.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import time
from bs4 import BeautifulSoup

DRIVER_PATH = '/Users/jasonbeedle/Desktop/snaviescraper/chromedriver'
options = Options()
options.page_load_strategy = 'normal'

# Navigate to url
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
driver.get("http://best4sport.tv/2hd/2020-12-10/")
options.add_argument("--window-size=1920x1080")

results = driver.find_element_by_class_name('program1_content_container')
soup = BeautifulSoup(results.text, 'html.parser')

# results = driver.find_element_by_class_name('program1_content_container')
p_data1 = soup.find_all("div", {"class_name": "program1_content_container"})
p_data2 = soup.find_all("div", {"class_name": "program_time"})
p_data3 = soup.find_all("div", {"class_name": "sport"})
p_data4 = soup.find_all("div", {"class": "program_text"})

print("Here is your data, I am off to sleep now see ya")
print(results.text)

# Create csv
programme_list = []

# Programme List
for item in p_data1:
    try:
        name = item.contents[1].find_all(
            "div", {"class": "program1_content_container"})[0].text
    except:
        name = ''
    p_data1 = [time]
    programme_list.append(p_data1)

# Programme Time
for item in p_data2:
    try:
        time = item.contents[1].find_all(
            "div", {"class": "program_time"})[0].text
    except:
        time = ''
    p_data2 = [time]
    programme_list.append(p_data2)

# Which sport
for item in p_data3:
    try:
        time = item.contents[1].find_all(
            "div", {"class": "sport"})[0].text
    except:
        time = ''
    p_data3 = [time]
    programme_list.append(p_data3)

with open('sport.csv', 'w') as file:
    writer = csv.writer(file)
    for row in programme_list:
        writer.writerow(row)
I have just tried adding an object called data_output and then printing it:
data_output = [p_data1, p_data2, p_data3, p_data4]
...
print(data_output)
The output in the terminal is: (screenshot omitted)
Load the data into a pandas dataframe and export it to csv.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup

DRIVER_PATH = '/Users/jasonbeedle/Desktop/snaviescraper/chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get("http://best4sport.tv/2hd/2020-12-10/")
results = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".program1_content_container")))
soup = BeautifulSoup(results.get_attribute("outerHTML"), 'html.parser')

program_time = []
sport = []
program_text = []
program_info = []
for item in soup.select(".program_details "):
    if item.find_next(class_='program_time'):
        program_time.append(item.find_next(class_='program_time').text.strip())
    else:
        program_time.append("Nan")
    if item.find_next(class_='sport'):
        sport.append(item.find_next(class_='sport').text.strip())
    else:
        sport.append("Nan")
    if item.find_next(class_='program_text'):
        program_text.append(item.find_next(class_='program_text').text.strip())
    else:
        program_text.append("Nan")
    if item.find_next(class_='program_info'):
        program_info.append(item.find_next(class_='program_info').text.strip())
    else:
        program_info.append("Nan")

df = pd.DataFrame({"program_time": program_time, "sport": sport, "program_text": program_text, "program_info": program_info})
print(df)
df.to_csv("sport.csv")
csv snapshot after creation
If you don't have pandas then you need to install it.
pip install pandas
As Blue Fishy said, you can try changing to w mode only, but you may run into an encoding error.
Solution that works on your data
import csv

programme_list = ['19:55', 'MOTORU SPORTS', 'Motoru sporta "5 minūte"', 'Iknedēļas Alda Putniņa veidots apskats par motoru sportu', '20:00', 'BASKETBOLS', '...']
with open('sport.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',', lineterminator='\n')
    for row in programme_list:
        print(row)
        writer.writerow([row])
Output
19:55
MOTORU SPORTS
"Motoru sporta ""5 minūte"""
Iknedēļas Alda Putniņa veidots apskats par motoru sportu
20:00
BASKETBOLS
...
Instead of writing binary, can you try changing wb to w?
Change
with open('sport.csv', 'wb') as file:
to
with open('sport.csv', 'w') as file:
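One more detail worth knowing: on Windows, csv.writer in text mode inserts blank lines between rows unless the file is opened with newline='', so a safer version of that line is:
with open('sport.csv', 'w', encoding='utf-8', newline='') as file: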
EDITED:
Sorry for being a bit late. Here is the code modified based on your original code FYI.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import time
from bs4 import BeautifulSoup

DRIVER_PATH = '/Users/jasonbeedle/Desktop/snaviescraper/chromedriver'
options = Options()
options.page_load_strategy = 'normal'

# Navigate to url
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
driver.get("http://best4sport.tv/2hd/2020-12-10/")
options.add_argument("--window-size=1920x1080")

results = driver.find_element_by_class_name('program1_content_container')
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')

# results = driver.find_element_by_class_name('program1_content_container')
p_data1 = soup.find_all("p", {"class": "program_info"})
p_data2 = soup.find_all("p", {"class": "program_time"})
p_data3 = soup.find_all("p", {"class": "sport"})
p_data4 = soup.find_all("p", {"class": "program_text"})

# Create csv
programme_list = []

# Programme List
for i in range(len(p_data1)):
    programme_list.append([p_data1[i].text.strip(), p_data2[i].text.strip(),
                           p_data3[i].text.strip(), p_data4[i].text.strip()])

with open('sport.csv', 'w', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["program_info", "program_time", "sport", "program_text"])
    for row in programme_list:
        writer.writerow(row)
Excel Screenshot here
This is my code below to web scrape the Tesco website, which I can't get to work. No error codes are showing, but it's also not creating a .csv file. Any help would be appreciated.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time
import csv

def save_csv(dct):
    name = "tescowhiskey.csv"
    print("[INFO] saving...")
    with open(name, 'a', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow((dct['cap'],
                         dct['Whiskey'],
                         dct['TESCO'],
                         ))

def scroll(driver):
    for _ in range(1, 6):
        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.execute_script("window.scrollTo(0, 1000)")
        time.sleep(7)

driver = webdriver.Firefox()
driver.get("https://www.tesco.com/groceries/en-GB/shop/drinks/spirits/whisky")

for i in range(2):
    element = WebDriverWait(driver, 30)
    scroll(driver)
    data = driver.find_elements_by_css_selector("product-list-container")
    for d in data:
        items = {}
        body = d.text.split("\n")
        items["cap"] = body[0]
        items["Whiskey"] = body[1]
        items["TESCO"] = body[-3]
        save_csv(items)
    driver.find_element_by_css_selector(".prev-next").click()
driver.quit()
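No answer was posted here, but one thing in the code looks suspect and could explain the silent failure (a guess, not verified against the live Tesco page): find_elements_by_css_selector("product-list-container") selects a <product-list-container> tag rather than an element with that class, so it likely returns an empty list and save_csv is never called. A class selector needs a leading dot:
# leading dot = class selector; without it Selenium looks for a tag named product-list-container
data = driver.find_elements_by_css_selector(".product-list-container")
It would also be safer to open the CSV with newline='' in save_csv to avoid blank rows on Windows.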
Hi, I am a coding newbie and I am trying to get news titles from cnn.com, just like in the image of the Excel file attached below.
The problem is that I don't know how to add each column, such as World/Politics/Health, and my code gets data only from the LAST element of the topic list (in this code, 'politics').
So here is my code. Thank you in advance!
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import os
import re
import xlsxwriter
from openpyxl import Workbook

path = "C:/Users/Desktop/chromedriver.exe"
driver = webdriver.Chrome(path)

# per section
a = ['world', 'health', 'politics']
wb = Workbook()
ws = wb.active

for i in a:
    nl = []
    driver.get("https://edition.cnn.com/" + str(i))
    driver.implicitly_wait(3)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    find_ingre = soup.select("span.cd__headline-text")
    for i in find_ingre:
        nl.append(i.get_text())

# make dataframe --> save xlsx
df = pd.DataFrame(nl)
df.to_excel("cnn_recent_topics.xlsx", index=False)
result now ---> (screenshot omitted)
result that I want to get ---> (screenshot omitted)
Could you try this? Comment if you need an explanation:
def custom_scrape(topic):
    nl = []
    driver.get("https://edition.cnn.com/" + str(topic))
    driver.implicitly_wait(3)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    find_ingre = soup.select("span.cd__headline-text")
    for i in find_ingre:
        nl.append(i.get_text())
    return nl

topics = ['world', 'health', 'politics']
result = pd.DataFrame()
for topic in topics:
    temp_df = pd.DataFrame(custom_scrape(topic))  # call the function; nl only exists inside it
    temp_df.columns = [topic]
    result = pd.concat([result, temp_df], axis=1)  # no ignore_index, so the topic labels survive
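After the loop, result can be written out the same way as in the question:
result.to_excel("cnn_recent_topics.xlsx", index=False)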
I want to retrieve the match data from the following website:
https://understat.com/match/81
I wrote the following script:
import sys
import time
import os
import io
import csv
from selenium import webdriver
import selenium.webdriver.support.expected_conditions as ec
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By

driver = None
cnx = None
currentDir = sys.path[0]

def scrap_understat():
    init_browser('firefox')
    for i in range(80, 10080):
        try:
            driver.get('https://understat.com/match/' + str(i))
            time.sleep(1)
            if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:
                continue
            data = get_match_data()
            save_data(data)
        except Exception as ex:
            log_this(ex)
            print(str(ex))
    close_browser()

def get_match_data():
    data = []
    teams = driver.find_elements(By.NAME, 'team')
    for team in teams:
        team.find_element(By.XPATH, 'following-sibling::*').click()
        time.sleep(1)
        players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
        for player in players:
            tds = player.find_elements(By.TAG_NAME, 'td')
            record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
                      tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]
            data.append(record)
    return data

def save_data(data):
    # save CSV
    csv_file_path = currentDir + '/output' + current_time + '.csv'
    file = None
    writer = None
    if not os.path.exists(csv_file_path):
        file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')
        writer = csv.writer(file)
        writer.writerow(['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG', 'xA'])
    else:
        file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')
        writer = csv.writer(file)
    for record in data:
        writer.writerow(record)
    file.close()
The output of my script looks like this: (screenshot omitted)
So there is a problem with the xG and xA columns: I only want the main number, but the script takes all the text within the td, including the superscript part. How do I change my script to only include the first part? Inspecting the page elements, I see that the undesired part sits in a sup element.
Second question: how do I get the team name (Manchester United / Tottenham Hotspur) into a variable?
Try this one to avoid picking up the sup text:
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
          tds[5].text, tds[6].text, tds[7].text,
          driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
          driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]
To get the team names you can use:
home = driver.find_element_by_xpath('//label[#for="team-home"]').text
away = driver.find_element_by_xpath('//label[#for="team-away"]').text
P.S. Consider using Waits instead of time.sleep.
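For instance, the time.sleep(1) after each roster click could become an explicit wait on the rosters table (a sketch reusing the script's ec and ui imports):
wait = ui.WebDriverWait(driver, 10)
# wait until the rosters table is present instead of sleeping a fixed second
wait.until(ec.presence_of_element_located((By.ID, 'match-rosters')))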
It looks like you just need to remove the sup's:
driver.execute_script("$('sup').remove()")