I am trying to scrape the data, but the data get overwritten and the CSV file ends up with only the last page's results. I think the for loop is overwriting the data. How can I fix this? Can you recommend a solution? I have already searched for an answer here and spent a long time on Google, but found nothing. I've already tried opening the file with 'w' instead of 'r' or 'a', but I still can't get my code to work.
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)

# log in
url = 'https://mergr.com/login'
driver.get(url)
email = driver.find_element(By.CSS_SELECTOR, "input#username")
email.send_keys("timgr8@outlook.com")
password = driver.find_element(By.CSS_SELECTOR, "input#password")
password.send_keys("Cosmos1990$$$$$$$")
login = driver.find_element(By.CSS_SELECTOR, "button.btn").click()

for page in range(1, 3):
    URL = 'https://mergr.com/firms/search/employees?page={page}&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'.format(page=page)
    driver.get(URL)

    added_urls = []
    product = []
    soup = BeautifulSoup(driver.page_source, "lxml")
    details = soup.select("tbody tr")
    for detail in details:
        try:
            t1 = detail.select_one("h5.profile-title a").text
        except:
            # pass  # then you'll just be using the previous row's t1
            # [also, if this happens in the first loop, it will raise an error]
            t1 = 'MISSING'  # '' #
        wev = {
            'Name': t1,
        }
        href = detail.select_one("h5.profile-title + p a[href]")
        if href and href.get("href", '').startswith('http'):
            wev['page_link'] = href.get("href")
            added_urls.append(href.get("href"))
        product.append(wev)

    ### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
    page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
    for link in page_links:
        href = link.get_attribute("href")
        if href in added_urls: continue  # skip links that are already added
        added_urls.append(href)
        product.append({"page_link": href})
    ##########################################################

    # visit each profile page and grab the website
    for pi, prod in enumerate(product):
        if "page_link" not in prod or not prod["page_link"]: continue  ## missing link
        url = prod["page_link"]
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "lxml")
        try:
            website = soup.select_one("p.adress-info a[target='_blank']").text
        except:
            website = ''
        del product[pi]["page_link"]  ## REMOVE this line IF you want a page_link column in csv
        # data={'website':website}
        # product.append(data)
        product[pi]['website'] = website

df = pd.DataFrame(product)
df.to_csv('firm.csv')
Currently, you're clearing the product list at the beginning of each page loop - either move the product=[] line to before for page in range(1,3), OR indent the last two lines [with append mode - df.to_csv('firm.csv', mode='a')] to get them inside the page loop; i.e., the product=[] line and the df... lines should have the SAME indent level.
(I don't recommend append mode, by the way - it's a bit annoying. If you use header=False, you won't have any headers [unless you write extra code to initialize the csv with them, like in saveScrawlSess in this crawler], but if you don't, the header row keeps getting repeated every few rows.)
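For reference, here is a minimal sketch of the first (recommended) option - one product list created before the page loop and the CSV written once at the end. It reuses the selectors and the driver/login setup from the code above and leaves out the per-profile website lookup for brevity:

product = []
added_urls = []

for page in range(1, 3):
    URL = 'https://mergr.com/firms/search/employees?page={page}&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'.format(page=page)
    driver.get(URL)
    soup = BeautifulSoup(driver.page_source, "lxml")

    for detail in soup.select("tbody tr"):
        try:
            t1 = detail.select_one("h5.profile-title a").text
        except AttributeError:
            t1 = 'MISSING'
        wev = {'Name': t1}
        href = detail.select_one("h5.profile-title + p a[href]")
        if href and href.get("href", '').startswith('http'):
            wev['page_link'] = href.get("href")
            added_urls.append(href.get("href"))
        product.append(wev)

# written once, after ALL pages have been collected
df = pd.DataFrame(product)
df.to_csv('firm.csv', index=False)

That way each page appends to the same list instead of starting over, and the CSV is only written after everything has been gathered.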
I am trying to scrape the data, but the data get overwritten and the CSV file only contains the data from the second page. I think the for loop is overwriting the data. How can I fix this? Can you recommend a solution? This is the page link: https://www.askgamblers.com/online-casinos/countries/ca/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)

for page in range(1, 3):
    URL = 'https://www.askgamblers.com/online-casinos/countries/ca/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)

    urls = []
    data = []
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    for link in page_links:
        href = link.get_attribute("href")
        urls.append(href)

    with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
        thewriter = writer(csvfile)
        header = ['name', 'url', 'website_link', 'company', 'rating']
        thewriter.writerow(header)

        for url in urls:
            driver.get(url)
            time.sleep(1)
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except:
                pass
            try:
                company = driver.find_element(By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except:
                pass
            try:
                link = driver.find_element(By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except:
                pass

            jobinfo = [name, url, link, company, rate]
            thewriter.writerow(jobinfo)
You open the same file for (over)writing with 'w' on every iteration of the page loop, so each page wipes out the previous one. Use a different filename per page, or use 'a' (append) instead - but then the header row gets written once per page with the current configuration.
It would be better to open the file for writing outside the for page loop, write the header once, and then write the rows inside the for page loop.
Basically:
with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    header = ['name', 'url', 'website_link', 'company', 'rating']
    thewriter.writerow(header)

    for page in range(1, 3):
        ...  # compute the row info
        jobinfo = [name, url, link, company, rate]
        thewriter.writerow(jobinfo)
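Put together, a sketch of the whole restructuring (not part of the original answer - it reuses the imports, driver setup and selectors from the question; the only addition is defaulting the fields to '' so a missing value doesn't silently reuse the previous page's value):

with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    thewriter.writerow(['name', 'url', 'website_link', 'company', 'rating'])

    for page in range(1, 3):
        driver.get('https://www.askgamblers.com/online-casinos/countries/ca/{page}'.format(page=page))
        time.sleep(2)

        # collect this listing page's detail urls before navigating away
        page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
        urls = [link.get_attribute("href") for link in page_links]

        for url in urls:
            driver.get(url)
            time.sleep(1)
            name = company = link = rate = ''  # defaults for fields that can't be found
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except:
                pass
            try:
                company = driver.find_element(By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except:
                pass
            try:
                link = driver.find_element(By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except:
                pass
            thewriter.writerow([name, url, link, company, rate])

This way the file is opened and the header written exactly once, and every page's rows end up in the same product.csv.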
I am trying to webscrape several pages of results. The first page works fine, but when I switch to the next page, it unfortunately just scrapes the first page of results again. The results don't load under a new URL, so that approach doesn't work; the next page is rather a window that opens on top of the original page. I also can't figure out how to append the results of the second page to the first - they come out as separate lists. Below is the code I have.
from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
#original webscraping code to get the names of locations from page 1
url = r'https://autochek.africa/en/ng/fix-your-car/service/scheduled-car-service'
driver = webdriver.Chrome()
driver.get(url)
xpath_get_locations = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div/div/form/div[7]/div/label'
driver.find_element_by_xpath(xpath_get_locations).click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
location_results = [i.text for i in soup.find_all('div', {'class': 'jsx-1642469937 state'})]
print(location_results)
time.sleep(3)
#finished page 1, finding the next button to go to page 2
xpath_find_next_button = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/div/div/div[3]/ul/li[13]'
driver.find_element_by_xpath(xpath_find_next_button).click()
#getting the locations from page 2
second_page_results = [i.text for i in soup.find_all('div', {'class': 'jsx-1642469937 state'})]
print(second_page_results)
time.sleep(2)
After loading a new page or running JavaScript on the page, you have to run
soup = BeautifulSoup(driver.page_source, 'html.parser')
again to work with the new HTML.
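Applied to the code in the question (keeping its original selectors and the older find_element_by_xpath call it already uses), the re-parse step after clicking the next button would look roughly like this:

# go to page 2, then build a NEW soup from the updated page source
driver.find_element_by_xpath(xpath_find_next_button).click()
time.sleep(2)  # give the page a moment to render the new results

soup = BeautifulSoup(driver.page_source, 'html.parser')  # re-parse the new HTML
second_page_results = [i.text for i in soup.find_all('div', {'class': 'jsx-1642469937 state'})]
print(second_page_results)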
Or skip BeautifulSoup and do it all in Selenium.
Use find_elements_... (note the s in elements).
items = driver.find_elements_by_xpath('//div[@class="jsx-1642469937 state"]')
location_result = [i.text for i in items]
By the way: the xpath doesn't need the r prefix because it doesn't use \.
A shorter and more readable xpath:
#xpath_get_locations = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div/div/form/div[7]/div/label'
xpath_get_locations = '//label[text()="Drop-off at Autochek location"]'
And it would be simpler to use the Next > button instead of searching for buttons 2, 3, etc.
xpath_find_next_button = '//li[@class="next-li"]/a'
EDIT:
Full working code which uses a while loop to visit all pages.
I added the module webdriver_manager, which automatically downloads a (fresh) driver for the browser.
I use find_element(By.XPATH, ...) because find_element_by_xpath(...) is deprecated.
from selenium import webdriver
from selenium.webdriver.common.by import By
#from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
#from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
#from webdriver_manager.firefox import GeckoDriverManager

driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
#driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

# ---

url = 'https://autochek.africa/en/ng/fix-your-car/service/scheduled-car-service'
driver.get(url)

#xpath_get_locations = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div/div/form/div[7]/div/label'
xpath_get_locations = '//label[text()="Drop-off at Autochek location"]'
driver.find_element(By.XPATH, xpath_get_locations).click()

# ---

all_locations = []

while True:
    # --- get locations on page
    time.sleep(1)  # sometimes `JavaScript` may need time to add new items (and you can't catch it with `WebDriverWait`)
    #soup = BeautifulSoup(driver.page_source, 'html.parser')
    #items = soup.find_all('div', {'class': 'jsx-1642469937 state'})
    items = driver.find_elements(By.XPATH, '//div[@class="jsx-1642469937 state"]')
    locations = [i.text for i in items]
    print(locations)
    print('-------')
    all_locations += locations

    # --- find button `next >` and try to click it
    #xpath_find_next_button = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/div/div/div[3]/ul/li[13]'
    xpath_find_next_button = '//li[@class="next-li"]/a'
    try:
        driver.find_element(By.XPATH, xpath_find_next_button).click()
    except:
        break  # exit loop

# ---

#driver.close()
Here is my full code. I want the output data in a CSV or Excel spreadsheet, with each field, such as title and price, in a separate column. My code goes to the details page of each product and collects the necessary information, such as the product title, price, etc.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

# argument for incognito Chrome
option = Options()
option.add_argument("--incognito")

browser = webdriver.Chrome(options=option)
browser.get("https://www.daraz.com.bd/consumer-electronics/?spm=a2a0e.pdp.breadcrumb.1.4d20110bzkC0bn")

# Wait 20 seconds for page to load
timeout = 20
try:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='c16H9d']")))
except TimeoutException:
    print("Timed out waiting for page to load")
    browser.quit()

# getting link of each product
soup = BeautifulSoup(browser.page_source, "html.parser")
product_items = soup.find_all("div", attrs={"data-qa-locator": "product-item"})

for item in product_items:
    item_url = f"https:{item.find('a')['href']}"
    print(item_url)
    browser.get(item_url)

    # scrape details page information
    itm_soup = BeautifulSoup(browser.page_source, "html.parser")
    container_box = itm_soup.find_all("div", {"id": "container"})

    # Use the itm_soup to find details about the item from its url.
    for itm in container_box:
        product_title_element = itm.find("span", class_="pdp-mod-product-badge-title")
        product_title = product_title_element.get_text() if product_title_element else "No title"
        print(product_title)

browser.quit()
How will I get the product title into a CSV or Excel spreadsheet?
You can use the csv writer module to accomplish this.
from csv import writer

def AddToCSV(List):
    with open("Output.csv", "a+", newline='') as output_file:
        csv_writer = writer(output_file)
        csv_writer.writerow(List)

# this can be used within your for loop
row_list = [item_url, product_title]  # add price, etc. as more fields are scraped
AddToCSV(row_list)
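As a side note (just a sketch, not part of the original answer): since the question also mentions Excel, you could instead collect the rows in a list of dicts and let pandas write either format at the end - to_excel() needs the openpyxl package installed. Reusing product_items, browser and BeautifulSoup from the question's code:

import pandas as pd

rows = []

for item in product_items:  # same loop as in the question
    item_url = f"https:{item.find('a')['href']}"
    browser.get(item_url)
    itm_soup = BeautifulSoup(browser.page_source, "html.parser")
    title_el = itm_soup.find("span", class_="pdp-mod-product-badge-title")
    rows.append({"url": item_url, "title": title_el.get_text() if title_el else "No title"})

df = pd.DataFrame(rows)
df.to_csv("output.csv", index=False)      # CSV
df.to_excel("output.xlsx", index=False)   # Excel spreadsheet (requires openpyxl)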
I am learning web crawling and scraping in Python. I want to scrape data from a site that contains links, and inside those links there are more links. So I want to scrape data down to a predefined level n.
This is my basic code
import requests
from selenium import webdriver
from requests_ntlm import HttpNtlmAuth
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from webdrivermanager import GeckoDriverManager
import pickle
from selenium.webdriver.common.keys import Keys
from urllib.parse import urljoin
from seleniumrequests import Chrome
options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
driver = webdriver.Chrome("./chromedriver",options=options)
web_url = 'https://spaceflightnow.com/'
driver.get("https://spaceflightnow.com/")
time.sleep(5)
soup = BeautifulSoup(driver.page_source,"lxml")
#section = soup.section
links=[]
for url in soup.find_all('a', href=True):
    links.append(urljoin(web_url, url.get('href')))
    #print(urljoin(web_url,url.get('href')))
links = list(filter(lambda x: x != web_url,links))
print(links)
This prints multiple links from the first page. Now I want to click through to all of those links at the next level and scrape them again, getting more links inside. The same links may show up again internally, e.g. from a news feed. So what I want to know is what my approach should be. I understand that I need a tree, but I cannot figure out exactly how to build it. Should I create a list inside a list? But how do I do that dynamically down to level n, and how do I map it to the data saved in a file? Can anyone help me with this, maybe with a sample solution?
Thank you :)
I made an example which works without recursion - I would say it is similar to the Breadth-First Search algorithm.
It keeps the urls in a list [(url, level), ...] to control the level, and in a set() to filter out already-visited pages. It also filters out links to external pages.
Tested with Firefox.
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# ---

def get_links(driver, url):
    driver.get(url)
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "lxml")
    links = []
    for new_url in soup.find_all('a', href=True):
        new_url = new_url.get('href')
        new_url = urljoin(url, new_url)
        links.append(new_url)
    return links

# ---

options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")

driver = webdriver.Chrome("./chromedriver", options=options)
#driver = webdriver.Firefox()

# ---

domain = 'https://spaceflightnow.com/'  # to filter external links
start_url = 'https://spaceflightnow.com/'
max_level = 2

links_visited = set([start_url])      # to test visited links
links_with_levels = [(start_url, 0)]  # to control levels

# ---

for link, level in links_with_levels:
    if level >= max_level:
        print('skip:', level, link)
        continue

    print('visit:', level, link)
    links = get_links(driver, link)
    print('found:', len(links))
    links = list(set(links) - links_visited)
    print('after filtering:', len(links))

    level += 1
    for new_link in links:
        if new_link.startswith(domain):  # filter external links
            links_visited.add(new_link)
            links_with_levels.append((new_link, level))

# ---

for link, level in links_with_levels:
    print('skip:', level, link)
I am learning Python by trying to solve problems.
When I attempt to access an element after logging in to the site, the same command works in the shell but doesn't work when it's in the following file.
Plus, I think my approach is wrong, as the element keeps changing its id and the only constant is the "More Search Results" text, for which I have tried: find_link_by_text, which fails (I assume because the element doesn't contain an href), and find_link_by_xpath with contains text.
Webscraping:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import requests, bs4, re, csv

chrome_path = r"C:\Users\-----\Desktop\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("https://dir.indiamart.com/search.mp?ss=Power+Distribution+Transformers")
driver.maximize_window()
time.sleep(10)  # setting a gap for website load

action = webdriver.ActionChains(driver)
elm = driver.find_element_by_id("user_sign_in").click()
inputElement = driver.find_element_by_id('email')
inputElement.send_keys('xxxxxx')
driver.find_element_by_name("Submit3").send_keys(Keys.RETURN)
time.sleep(30)
# The code till above this is working perfectly

# element:
# <div id="scroll2" class="fm2 p8 cur m_bt2"
#      onclick="javascript:displayResultsLogin('scroll2')"> Show More Results
# </div>
try:
    driver.find_element_by_id("scroll2").click()
    # Trying the above find_element_* works if I input it in shell.
except:
    print("Didn't work")
    pass
# If I leave it in the file, removing the except, it shows element not found

r = driver.page_source
soup = bs4.BeautifulSoup(r, 'html.parser')
blocks = soup.find_all('div', class_='lst')

with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for b in blocks:
        name = b.find(class_='cnm').get_text(strip=True)
        addr = b.find(class_='clg').get_text(strip=True)
        call = b.find(class_='ls_co phn').find(text=re.compile('\d+')).strip()
        writer.writerow([name, addr, call])
For some reason, when run from this file, the last part will only add the 0 from the element into the CSV and not the xxxxxxxx number.
It works in the shell but not when you run it in a script - this suggests it's a timing issue. In the shell, you have delays between each of the commands, allowing the page to load; in the script you don't. The problem can be solved with WebDriverWait and one of the Expected Conditions:
wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.ID, "scroll2"))).click()
# or try locating the element by text
# wait.until(EC.element_to_be_clickable((By.XPATH, "//*[contains(., 'Show More Results')]"))).click()
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".lst")))
r = driver.page_source
soup = bs4.BeautifulSoup(r, 'html.parser')
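In the posted script, that wait-based click would simply replace the fixed time.sleep(30) and the try/except around scroll2 - a sketch, using the imports already at the top of the file:

wait = WebDriverWait(driver, 30)

# wait until the "Show More Results" element is clickable, then click it
wait.until(EC.element_to_be_clickable((By.ID, "scroll2"))).click()

# wait until the result blocks are present before parsing the page source
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".lst")))
r = driver.page_source
soup = bs4.BeautifulSoup(r, 'html.parser')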