Writing data to a CSV file, but the data gets overwritten - Python

I'm trying to scrape data across multiple pages, but the data gets overwritten and the CSV file ends up containing only page 2's data. How can I fix this? I think the for loop is overwriting the data. This is the page link: https://www.askgamblers.com/online-casinos/countries/ca/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)

for page in range(1, 3):
    URL = 'https://www.askgamblers.com/online-casinos/countries/ca/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)
    urls = []
    data = []
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    for link in page_links:
        href = link.get_attribute("href")
        urls.append(href)

    with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
        thewriter = writer(csvfile)
        header = ['name', 'url', 'website_link', 'company', 'rating']
        thewriter.writerow(header)

        for url in urls:
            driver.get(url)
            time.sleep(1)
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except:
                pass
            try:
                company = driver.find_element(By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except:
                pass
            try:
                link = driver.find_element(By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except:
                pass

            jobinfo = [name, url, link, company, rate]
            thewriter.writerow(jobinfo)

You open the same file for (over)writing with 'w' on every pass of the page loop, so each page's data replaces the previous page's. You could use a different file name per page, or open with 'a' (append) instead, but with the current structure you would also get the header row written once per page.
Better would be to open the file for writing outside the for page loop, write the header once, and then write the rows inside the for page loop.
Basically:
with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    header = ['name', 'url', 'website_link', 'company', 'rating']
    thewriter.writerow(header)

    for page in range(1, 3):
        ...  # compute the row info
        jobinfo = [name, url, link, company, rate]
        thewriter.writerow(jobinfo)
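For completeness, a fuller sketch of that restructuring, reusing the selectors from the question; initializing each field to '' before the try blocks is my own addition, so a missing element doesn't silently reuse the previous casino's value (or raise NameError on the first row):

with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    thewriter.writerow(['name', 'url', 'website_link', 'company', 'rating'])

    for page in range(1, 3):
        driver.get('https://www.askgamblers.com/online-casinos/countries/ca/{}'.format(page))
        time.sleep(2)
        # collect the review links on this listing page
        urls = [a.get_attribute("href") for a in driver.find_elements(
            By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")]

        for url in urls:
            driver.get(url)
            time.sleep(1)
            name = company = link = rate = ''   # defaults so nothing carries over between rows
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except Exception:
                pass
            try:
                company = driver.find_element(By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except Exception:
                pass
            try:
                link = driver.find_element(By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except Exception:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except Exception:
                pass
            thewriter.writerow([name, url, link, company, rate])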


Read csv file from url

I want to read URLs from a CSV file and open them, but I get an error that the URL must be a string. Is there any way to fix this? How can I pass each URL as a string?
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer

with open("data.csv") as file:
    start_urls = [line.strip() for line in file]

    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--disable-extensions")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(start_urls)
As commented by @JeJe,
you need to iterate through your start_urls list.
start_urls is a LIST of strings, and driver.get expects just one URL as a string - something like:
# driver.get(start_urls) # NOT like this

for sUrl in start_urls:
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
or, if you want to keep track of progress, something like
suLen = len(start_urls)
for si, sUrl in enumerate(start_urls):
    print(f'[{si} of {suLen}] reading {sUrl}')
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
Btw, you don't need to do everything under with open... - getting start_urls is enough:
with open("data.csv") as file:
start_urls=[line.strip() for line in file]
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
suLen = len(start_urls)
for si, sUrl in enumerate(start_urls):
print(f'[{si} of {suLen}] reading {sUrl}')
driver.get(sUrl)
### SCRAPE AS NEEDED ###
But you can only use start_urls = [line.strip() for line in file] if your csv has only one column and no header.
You could also rewrite your code using functions to deal with each step:
def read_file(has_header: bool = True):
    with open("data.csv") as file:
        start_urls = [line.strip() for line in file]
    # assumes there is a header
    if has_header:
        return start_urls[1:]
    else:
        return start_urls


def scrap_urls():
    urls = read_file(has_header=True)
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--disable-extensions")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    for url in urls:
        driver.get(url)


if __name__ == '__main__':
    scrap_urls()
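One small addition that is not in the code above but is usually worth having: quit the driver when the loop finishes, so Chrome doesn't stay open if something raises mid-scrape. A sketch of scrap_urls with that change:

def scrap_urls():
    urls = read_file(has_header=True)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        for url in urls:
            driver.get(url)
            # ... scrape as needed ...
    finally:
        driver.quit()   # always close the browser, even if scraping fails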

Data is overwritten when using BeautifulSoup

I'm trying to scrape data across multiple pages, but the data gets overwritten and the CSV file only contains page 2's data. How can I fix this? I think the for loop is overwriting the data. I have already searched for an answer here and spent a long time on Google, but found nothing. I've already tried opening the file with 'w' instead of 'r' or 'a', but I still can't get my code to work.
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)

url = 'https://mergr.com/login'
driver.get(url)
email = driver.find_element(By.CSS_SELECTOR, "input#username")
email.send_keys("timgr8@outlook.com")
password = driver.find_element(By.CSS_SELECTOR, "input#password")
password.send_keys("Cosmos1990$$$$$$$")
login = driver.find_element(By.CSS_SELECTOR, "button.btn").click()

for page in range(1, 3):
    URL = 'https://mergr.com/firms/search/employees?page={page}&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'.format(page=page)
    driver.get(URL)
    added_urls = []
    product = []
    soup = BeautifulSoup(driver.page_source, "lxml")
    details = soup.select("tbody tr")
    for detail in details:
        try:
            t1 = detail.select_one("h5.profile-title a").text
        except:
            # pass # then you'll just be using the previous row's t1
            # [also, if this happens in the first loop, it will raise an error]
            t1 = 'MISSING'  # '' #
        wev = {
            'Name': t1,
        }
        href = detail.select_one("h5.profile-title + p a[href]")
        if href and href.get("href", '').startswith('http'):
            wev['page_link'] = href.get("href")
            added_urls.append(href.get("href"))
        product.append(wev)

    ### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
    page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
    for link in page_links:
        href = link.get_attribute("href")
        if href in added_urls: continue  # skip links that are already added
        # urls.append(href)
        added_urls.append(href)
        product.append({"page_link": href})
    ##########################################################

    for pi, prod in enumerate(product):
        if "page_link" not in prod or not prod["page_link"]: continue  ## missing link
        url = prod["page_link"]
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "lxml")
        try:
            website = soup.select_one("p.adress-info a[target='_blank']").text
        except:
            website = ''
        del product[pi]["page_link"]  ## REMOVE this line IF you want a page_link column in csv
        # data={'website':website}
        # product.append(data)
        product[pi]['website'] = website

df = pd.DataFrame(product)
df.to_csv('firm.csv')
Currently, you're clearing the product list at the beginning of each page loop - either move the product=[] line to before for page in range(1,3), OR indent the last two lines [with append mode - df.to_csv('firm.csv', mode='a')] to get them inside the page loop; i.e., the product=[] line and the df... lines should have the SAME indent level.
(I don't recommend append mode, by the way; it's a bit annoying - if you use header=False, you won't have any headers [unless you write extra code to initialize the csv with them, like in saveScrawlSess in this crawler], but if you don't, then the header row keeps getting repeated every few rows.)
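A minimal sketch of the first option, moving product (and added_urls) above the page loop and writing the DataFrame once at the end, with the scraping logic from the question left unchanged (index=False is just an optional extra to drop the unnamed index column):

product = []      # now accumulates rows across ALL pages
added_urls = []

for page in range(1, 3):
    URL = 'https://mergr.com/firms/search/employees?page={page}&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'.format(page=page)
    driver.get(URL)
    soup = BeautifulSoup(driver.page_source, "lxml")
    # ... collect names, links and websites into product exactly as above ...

# one write, after every page has been scraped
df = pd.DataFrame(product)
df.to_csv('firm.csv', index=False)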

String read from pandas column is shorter than the real value

I have a df like this:
Sucursal | url
3        | https://www.google.com/maps/place/Moov/@-34.4718341,-58.5170394,17z/data=!3m1!4b1!4m5!3m4!1s0x95bcb1c7fd6203c3:0x37a2af54c5fe1a!8m2!3d-34.4718341!4d-58.5148507
11       | https://www.google.com/maps/place/Moov/@-34.6496456,-58.6220989,17z/data=!3m1!4b1!4m5!3m4!1s0x95bcc7612176b9a3:0x2c6fcf2164a536b9!8m2!3d-34.6496453!4d-58.6199217
(Real df has more than 100 rows)
I'm using Selenium to get some reviews from Google, so I need to loop over the URLs in the "url" column.
This is the first part of the script :
for i in range(0, 2):  # I'm testing with some elements to see if it works
    driver = webdriver.Chrome()
    url = str(df.iloc[[i]]['url']).split()[1]
    driver.get(url)
    time.sleep(5)
    driver.find_element_by_xpath('//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/div[2]/span[2]/span[1]/button').click()
    time.sleep(1)
and so on
The problem is that when the driver opens the page, the URL pasted into the rendered page is: "https://www.google.com/maps/place/Moov/@-34.4718341,-58.5170394,17z"
The same happens with all the URL strings... I don't know why it's not getting 100% of the string.
If I print as value:
print(df['url'].values)
The url string is ok...but not when it opens the chrome driver...
What am I doing wrong?
Thanks in advance!
I used a tsv file to generate the df based on your data:
import contextlib
import pandas
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])

# path to your chromedriver
service = Service(executable_path='your-path')
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 10)

df = pandas.read_csv('grev.tsv', sep='\t')

# accept cookies
driver.get('https://www.google.com/maps')
with contextlib.suppress(TimeoutException):
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'form[action]:nth-child(2)'))).click()

# open links from df and get reviews
for url in df['url']:
    driver.get(url)
    # click on the "More" button for reviews
    driver.execute_script("for (let elem of document.querySelectorAll('jsl>button')) {elem.click()}")
    # get the first 3 reviews
    reviews = driver.find_elements(By.CSS_SELECTOR, 'div[data-review-id]:first-child')
    for review in reviews:
        print(review.find_element(By.CSS_SELECTOR, 'div[class="MyEned"]').text)

driver.close()
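As for why the original approach lost part of the URL: str(df.iloc[[i]]['url']) stringifies a one-element Series, and pandas shortens long cell values in that printed representation (controlled by the display.max_colwidth option), so the tail of the URL gets cut off. Reading the cell value directly returns the full string; a small illustration (row 0 is just an example index):

# the Series repr is meant for display and truncates long strings
print(str(df.iloc[[0]]['url']))

# direct cell access returns the full string, safe to pass to driver.get()
full_url = df['url'].iloc[0]   # or df.loc[0, 'url']
print(len(full_url), full_url)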

Scraping with Selenium. for page in range() issue

I am trying to scrape ethplorer.io and want to scrape many pages. My code is below, but with range(11,14) it scrapes page 11 three times, and I can't understand why.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
base_url = "https://ethplorer.io/tr/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d#pageSize=100&tab=tab-holders&holders="
results = []
for page_number in range(11, 14):
    url = base_url + str(page_number)
    driver.get(url)
    data = driver.find_elements(By.CLASS_NAME, "local-link")
    for x in data:
        results.append(x.text)

driver.quit()

with open("all_data.txt", "w") as file:
    for x in results:
        file.write(x + "\n")
I have applied some modifications to your code to have it work through the several pages you are calling and to capture the text within the hyperlinks I assume you are targeting. The page number in your URL comes after a '#', i.e. it is only a URL fragment, so most likely driver.get never triggers a fresh page load and you keep reading page 11's table; the code below clicks the pagination links instead and waits for the new data. Please check below:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())

url = "https://ethplorer.io/tr/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d#pageSize=100&tab=tab-holders&holders=11"  # Here we are calling the first page we need, which is page #11 in this case.
xpath = "//div[@id='token-holders-tab']//div[@id='address-token-holders']//div[@class='block']//table//tr//td//a[contains(@class,'local-link')]"
driver.get(url)
data = driver.find_elements_by_xpath(xpath)

results = []
for page_number in range(12, 15):  # Range should start from the next page (page #12 in this case). Range ends with the last page you need + 2; here you need to scrape from 11 to 13, so the range end should be 15.
    for x in data:  # On the first round this collects page #11's data.
        results.append(x.text)
    nextPage = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[@id='token-holders-tab']//tr[contains(@class,'paginationFooter')]//A[@class='page-link'][text()='" + str(page_number) + "']")))
    driver.execute_script("arguments[0].click();", nextPage)
    time.sleep(3)
    data = driver.find_elements_by_xpath(xpath)

driver.quit()

with open("all_data.txt", "w") as file:
    for x in results:
        file.write(x + "\n")
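A side note in case you are on Selenium 4: the find_elements_by_xpath helper has been removed in recent releases, so with a current install the equivalent lookup would be the following (everything else stays the same):

from selenium.webdriver.common.by import By

data = driver.find_elements(By.XPATH, xpath)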

bs4 again from website and save to text file

I am learning how to extract data from websites and have managed to get a lot of information. However, for my next website I am failing for some unknown reason: nothing is saved to the text file, nor do I get any output from print. Here is my piece of code:
import json
import urllib.request
from bs4 import BeautifulSoup
import requests

url = 'https://www.jaffari.org/'
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)
html = response.read()

soup = BeautifulSoup(html.decode("utf-8"), "html.parser")
table = soup.find('div', attrs={"class": "textwidget"})
name = table.text.encode('utf-8').strip()

with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name, outfile)
print(name)
Can anyone help please?
The prayer times are rendered by JavaScript, so you need to use a browser tool like Selenium to load the page and then use Beautiful Soup to get the data.
You need to download a compatible ChromeDriver from this link and pass the chromedriver path as I have provided.
The code below fetches the names and prayer times and saves them in a text file.
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")

# path of the chrome driver
driver = webdriver.Chrome(executable_path="D:\Software\chromedriver.exe", chrome_options=options)
driver.headless = True
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")

# Get the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver
driver.close()

with open('testPrayers.txt', 'w') as outfile:
    for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
        name = row.select("td")[0].text.strip()
        time = re.findall('(\d{1,2}:?\d{1,2}\W[A|P]M$)', row.select("td")[1].text.strip())
        outfile.write(name + " " + time[0] + "\n")
        print(name + " " + time[0])
print('Done')
Updated code, saving each entry to a separate file named after it:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")

# path of the chrome driver
driver = webdriver.Chrome(executable_path="D:\Software\chromedriver.exe", chrome_options=options)
driver.headless = True
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")

# Get the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver
driver.close()

for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
    name = row.select("td")[0].text.strip()
    time = re.findall('(\d{1,2}:?\d{1,2}\W[A|P]M$)', row.select("td")[1].text.strip())
    print(name + " " + time[0])
    with open(name + '.txt', 'w') as outfile:
        outfile.write(time[0])
print('Done')
The name variable needs to be a string rather than a bytes object. Try with:

with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name.decode(), outfile)
print(name.decode())
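Alternatively, you could avoid the encode/decode round trip entirely and keep the text as a str from the start (a small sketch reusing the selector from the question):

table = soup.find('div', attrs={"class": "textwidget"})
name = table.text.strip()          # plain str, no .encode() needed

with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name, outfile)       # json.dump accepts str directly
print(name)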
Hope it helps.
