Cleaning up DataFrame output | Selenium | Python

I've got a script set up to loop through a group of URLs. The script runs fine, but I can't figure out how to tweak things to produce a cleaner CSV output.
I'll take any suggestions I can get to minimize the time needed to clean up the formatting, delete Excel cells, and the like.
Note: the way I'm scraping the volume text is the only way I've figured out to get what I need. Hopefully we can find a good solution for improving the final output without compromising this part of the script.
Here's my script:
group_url = [
    'https://www.example.com',
    'https://www.example2.com',
    'https://www.example3.com',
    'https://www.example4.com',
]
data = []
for group in group_url:
    driver.get(group)
    wait = WebDriverWait(driver, 90)
    element = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')))
    time.sleep(3)
    kws = driver.find_elements_by_css_selector(".css-hijzdp-base")
    counter = 1
    for kw in kws:
        if counter <= 5:
            try:
                data.append({
                    "Keyword": kw.text
                })
                counter = counter + 1
            except NoSuchElementException:
                pass
    urls = driver.find_elements_by_css_selector(".css-a5m6co-text.css-p8ym46-fontFamily.css-11397xj-fontSize.css-18j1nfb-display")
    count = 1
    for url in urls:
        if count <= 5:
            try:
                data.append({
                    "URL": url.text
                })
                count = count + 1
            except NoSuchElementException:
                pass
    try:
        vol1 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume1": vol1.text
        })
    try:
        vol2 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[2]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume2": vol2.text
        })
    try:
        vol3 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[3]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume3": vol3.text
        })
    try:
        vol4 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[4]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume4": vol4.text
        })
    try:
        vol5 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[5]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        data.append({
            "Volume5": vol5.text
        })
driver.close()
print(data)
# print to csv
df = pd.DataFrame(data)
df.to_csv('testOutput 11_11_21.csv')
Here is a screenshot of the final output:

You're appending every row item independently to data. Collect them first in a dictionary within the for loop, then append the dictionary to the list data:
group_url = [
    'https://www.example.com',
    'https://www.example2.com',
    'https://www.example3.com',
    'https://www.example4.com',
]
data = []
for group in group_url:
    tmp_dict = {}
    driver.get(group)
    wait = WebDriverWait(driver, 90)
    element = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')))
    time.sleep(3)
    kws = driver.find_elements_by_css_selector(".css-hijzdp-base")
    counter = 1
    keywords = []
    for kw in kws:
        if counter <= 5:
            try:
                keywords.append(kw.text)
                counter = counter + 1
            except NoSuchElementException:
                pass
    tmp_dict["Keyword"] = keywords
    urls = driver.find_elements_by_css_selector(".css-a5m6co-text.css-p8ym46-fontFamily.css-11397xj-fontSize.css-18j1nfb-display")
    count = 1
    urls_results = []
    for url in urls:
        if count <= 5:
            try:
                urls_results.append(url.text)
                count = count + 1
            except NoSuchElementException:
                pass
    tmp_dict["URL"] = urls_results
    try:
        vol1 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[1]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        tmp_dict["Volume1"] = vol1.text
    try:
        vol2 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[2]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        tmp_dict["Volume2"] = vol2.text
    try:
        vol3 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[3]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        tmp_dict["Volume3"] = vol3.text
    try:
        vol4 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[4]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        tmp_dict["Volume4"] = vol4.text
    try:
        vol5 = driver.find_element_by_xpath('//*[@id="root"]/div/div[2]/main/div/div[3]/div[1]/div[2]/div/table/tbody/tr[5]/td[6]/div')
    except NoSuchElementException:
        pass
    else:
        tmp_dict["Volume5"] = vol5.text
    data.append(tmp_dict)
driver.close()
print(data)
# print to csv
df = pd.DataFrame(data)
df.to_csv('testOutput 11_11_21.csv')
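A further tweak if the goal is one row per keyword rather than list-valued cells: since the i-th keyword, URL and volume appear to come from the same table row, the per-page lists can be zipped into per-row dictionaries before building the DataFrame. A sketch (row_data is a new name; it replaces the last two lines above, and index=False simply drops pandas' unnamed index column):
row_data = []
for page in data:
    # Volume1..Volume5 may be missing if a row was not found; .get() returns None then.
    volumes = [page.get(f"Volume{i}") for i in range(1, 6)]
    for kw, url, vol in zip(page["Keyword"], page["URL"], volumes):
        row_data.append({"Keyword": kw, "URL": url, "Volume": vol})

df = pd.DataFrame(row_data)          # columns: Keyword, URL, Volume
df.to_csv('testOutput 11_11_21.csv', index=False)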

Related

Adding to a list from a for loop

I got most of my code working but have a lingering question. This is not my full code; in the interest of readability I selected only a portion. I'm scraping a list of URLs from a web page (into imgs2) and then scraping info from that list of URLs. I would like to create a second list of URLs based on the results gathered from the first (see img_url2 below). What happens is that instead of appending every new URL to the list, it just replaces the previous one with the new one. Any idea how to have all of them added to the list?
driver.get("https://superrare.com/market?market-options=%257B%2522first%2522%3A30%2C%2522orderBy%2522%3A%2522RECENT_NFT_EVENT_BY_TOKEN_CONTRACT_ADDRESS_AND_TOKEN_ID__TIMESTAMP_DESC%2522%2C%2522fileTypes%2522%3A%255B%2522image%2Fjpeg%2522%2C%2522image%2Fpng%2522%255D%2C%2522listPrice%2522%3Afalse%2C%2522isGenesis%2522%3Afalse%2C%2522isSeries%2522%3Afalse%2C%2522neverReceivedOffer%2522%3Afalse%2C%2522reservePrice%2522%3Afalse%2C%2522liveAuctions%2522%3Afalse%2C%2522upcomingAuctions%2522%3Afalse%2C%2522hasSold%2522%3Afalse%2C%2522ownedByCreator%2522%3Afalse%2C%2522openOffers%2522%3Afalse%2C%2522artistsCollected%2522%3Afalse%2C%2522artistsYouFollow%2522%3Afalse%2C%2522artistsThatFollowYou%2522%3Afalse%2C%2522artistsFollowedByFollowed%2522%3Afalse%2C%2522lowerPriceRange%2522%3A0%2C%2522upperPriceRange%2522%3A100000%2C%2522numCreatorSales%2522%3Afalse%2C%2522lowerMintedRange%2522%3Anull%2C%2522upperMintedRange%2522%3Anull%2C%2522startCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjExOjMyKzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwxNzYzMF1d%2522%2C%2522endCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE2VDIwOjMxOjUxKzAwOjAwIiwiMHg0MjQyMzk5YzE2Yjc4MzgxOTZlZDMzZjE3OWU5OWUzZjk5Yjg4NGYyIiwzXV0%3D%2522%2C%2522lastEndCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjMwOjI3KzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwyNzgxNl1d%2522%2C%2522lastStartCursor%2522%3Afalse%2C%2522hasPreviousPage%2522%3Atrue%2C%2522hasNextPage%2522%3Atrue%2C%2522reverse%2522%3Afalse%257D")
imgs2 = WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@class,'Name-sc-7kf6vz-3')]")))
time.sleep(5)
for i in range(0,30):
    img_url = []
    for number, item in enumerate(imgs2, 1):
        imgwors2 = item.get_attribute("href")
        driver3 = webdriver.Chrome()
        driver3.get(imgwors2)
        def check_exists_by_xpath(xpath):
            try:
                WebDriverWait(driver3,55).until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
            except TimeoutException:
                return False
            return True
        if check_exists_by_xpath("//h1[@class='collectible-detail__collectible-name']"):
            imgsrc4 = WebDriverWait(driver3,65).until(EC.presence_of_all_elements_located((By.XPATH, "//h1[contains(@class,'collectible-detail__collectible-name')]")))
            for i in imgsrc4:
                title = i.text
        else:
            title = "none"
        print(title)
        img_url2 = []
        imgsrc2 = WebDriverWait(driver3,55).until(EC.presence_of_all_elements_located((By.XPATH, "//p[@data-testid='artistName']/ancestor::a[contains(@class,'ChildrenLink')]")))
        for i in imgsrc2:
            biourl = i.get_attribute("href")
            img_url2.append(biourl)
        print(img_url2)
driver.close()
I think from your description and code, the variable img_url2 should be initialized before the for loop(s)
driver.get("https://superrare.com/market?market-options=%257B%2522first%2522%3A30%2C%2522orderBy%2522%3A%2522RECENT_NFT_EVENT_BY_TOKEN_CONTRACT_ADDRESS_AND_TOKEN_ID__TIMESTAMP_DESC%2522%2C%2522fileTypes%2522%3A%255B%2522image%2Fjpeg%2522%2C%2522image%2Fpng%2522%255D%2C%2522listPrice%2522%3Afalse%2C%2522isGenesis%2522%3Afalse%2C%2522isSeries%2522%3Afalse%2C%2522neverReceivedOffer%2522%3Afalse%2C%2522reservePrice%2522%3Afalse%2C%2522liveAuctions%2522%3Afalse%2C%2522upcomingAuctions%2522%3Afalse%2C%2522hasSold%2522%3Afalse%2C%2522ownedByCreator%2522%3Afalse%2C%2522openOffers%2522%3Afalse%2C%2522artistsCollected%2522%3Afalse%2C%2522artistsYouFollow%2522%3Afalse%2C%2522artistsThatFollowYou%2522%3Afalse%2C%2522artistsFollowedByFollowed%2522%3Afalse%2C%2522lowerPriceRange%2522%3A0%2C%2522upperPriceRange%2522%3A100000%2C%2522numCreatorSales%2522%3Afalse%2C%2522lowerMintedRange%2522%3Anull%2C%2522upperMintedRange%2522%3Anull%2C%2522startCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjExOjMyKzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwxNzYzMF1d%2522%2C%2522endCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE2VDIwOjMxOjUxKzAwOjAwIiwiMHg0MjQyMzk5YzE2Yjc4MzgxOTZlZDMzZjE3OWU5OWUzZjk5Yjg4NGYyIiwzXV0%3D%2522%2C%2522lastEndCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjMwOjI3KzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwyNzgxNl1d%2522%2C%2522lastStartCursor%2522%3Afalse%2C%2522hasPreviousPage%2522%3Atrue%2C%2522hasNextPage%2522%3Atrue%2C%2522reverse%2522%3Afalse%257D")
imgs2 = WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@class,'Name-sc-7kf6vz-3')]")))
time.sleep(5)
img_url2 = []  # <--- moved before the loop
for i in range(0,30):
    for number, item in enumerate(imgs2, 1):
        imgwors2 = item.get_attribute("href")
        driver3 = webdriver.Chrome()
        driver3.get(imgwors2)
        def check_exists_by_xpath(xpath):
            try:
                WebDriverWait(driver3,55).until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
            except TimeoutException:
                return False
            return True
        if check_exists_by_xpath("//h1[@class='collectible-detail__collectible-name']"):
            imgsrc4 = WebDriverWait(driver3,65).until(EC.presence_of_all_elements_located((By.XPATH, "//h1[contains(@class,'collectible-detail__collectible-name')]")))
            for i in imgsrc4:
                title = i.text
        else:
            title = "none"
        print(title)
        imgsrc2 = WebDriverWait(driver3,55).until(EC.presence_of_all_elements_located((By.XPATH, "//p[@data-testid='artistName']/ancestor::a[contains(@class,'ChildrenLink')]")))
        for i in imgsrc2:
            biourl = i.get_attribute("href")
            img_url2.append(biourl)
driver.close()
print(img_url2)  # <--- moved below the loop
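The difference is easiest to see in isolation; re-initialising the list inside the loop discards everything collected so far, while initialising it once before the loop lets it accumulate:
# Re-initialising inside the loop keeps only the last item:
for page in range(3):
    results = []
    results.append(page)
print(results)   # [2]

# Initialising once before the loop accumulates everything:
results = []
for page in range(3):
    results.append(page)
print(results)   # [0, 1, 2]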

Trying to Scrape Instagram Post Data from a .csv with Links - For Master's Thesis

I am trying to scrape Instagram post data (number of likes, caption, hashtags, mentions and number of comments) from a collection of links in a .csv for data analysis to put towards my Master's Thesis. However, I am coming across an error where the XPath or element cannot be found. Here is the error message:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/button"}
Here is the code block I have written using Selenium:
def scrape_post_data():
    influencerpostsdata = []
    # Specify the path to chromedriver.exe
    chromedriver_path = r"C:\\Users\\stuar\\Instagram Scraper\\ChromeDrivers\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=chromedriver_path)
    time.sleep(2)
    # Open the webpage
    url = "https://www.instagram.com"
    driver.get(url)
    time.sleep(3)
    # Alert number 1
    time.sleep(5)
    alert = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accept All")]'))).click()
    # Target Username Entry
    username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
    password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))
    # Enter Username and Password
    login_username = str(enter_loginusername_entry.get())
    login_password = str(enter_password_entry.get())
    username.clear()
    username.send_keys(login_username)
    password.clear()
    password.send_keys(login_password)
    button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()
    # Alert number 2
    time.sleep(5)
    alert2 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
    # Alert number 3
    time.sleep(5)
    alert3 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
    with open(r"C:\\Users\\stuar\\Instagram Scraper\\SourceCode/influencerpostlinks1.csv",'r') as csv_file:
        csv_reading = csv.reader(csv_file)
        for line in csv_reading:
            links = line[1]
            try:
                Page = driver.get(links)
            except Exception as e:
                Page = None
            time.sleep(20)
            try:
                # This captures the standard like count.
                likes = driver.find_element_by_xpath("""//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/button""").text.split()[0]
                post_type = 'photo'
            except:
                # This captures the like count for videos which is stored
                likes = driver.find_element_by_xpath("""//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/span""").text.split()[0]
                post_type = 'video'
            age = driver.find_element_by_css_selector('a time').text
            comment = driver.find_element_by_xpath("""//*[@id="react-root"]/section/main/div/div/article/div[2]/div[1]/ul/div/li/div/div/div[2]/span""").text
            hashtags = find_hashtags(comment)
            mentions = find_mentions(comment)
            post_details = {'link': url, 'type': post_type, 'likes/views': likes,
                            'age': age, 'comment': comment, 'hashtags': hashtags,
                            'mentions': mentions}
            time.sleep(10)
            # turning data into a .csv file
            influencerpostsdata.append(post_details)
    df = pd.DataFrame(influencerpostsdata)
    print(df)
    df.to_csv('influencerpostsdata.csv')
    driver.close()
Not to worry, I have resolved the problem:
with open(r"C:\\Users\\stuar\\Instagram Scraper\\SourceCode/influencerpostlinks1.csv",'r') as csv_file:
csv_reading = csv.reader(csv_file)
for line in csv_reading:
links = line[1]
try:
Page = driver.get(links)
except Exception as e:
Page = None
time.sleep(20)
try:
likes = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/div[1]/article/div[3]/section[2]/div/div/a/span')
except Exception as e:
likes = None
try:
likes2 = likes.text
except Exception as e:
likes2 = None
time.sleep(20)
try:
age = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/div[1]/article/div[3]/div[2]/a/time')
except Exception as e:
age = None
try:
age2 = age.text
except Exception as e:
age2 = None
time.sleep(20)
try:
caption = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/div[1]/article/div[3]/div[1]/ul/div/li/div/div/div[2]/span')
except Exception as e:
caption = None
try:
caption2 = caption.text
except Exception as e:
caption2 = None
time.sleep(20)
try:
AccountName = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/div[1]/article/header/div[2]/div[1]/div/span/a')
except Exception as e:
AccountName = None
try:
AccountName2 = AccountName.text
except Exception as e:
AccountName2 = None
time.sleep(20)
post_details = {'Username': AccountName2,'Caption': caption2, 'Likes/Views': likes2,
'Age': age2 }
#turning data into a .csv file
influencerpostsdata.append(post_details)
df = pd.DataFrame(influencerpostsdata)
print(df)
df.to_csv('influencerpostsdata.csv')
driver.close()
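As a further cleanup, the repeated try/except pairs could be factored into a small helper (a sketch only; safe_text is a hypothetical name, not part of the original code, and it uses the same find_element_by_xpath API as the snippet above):
def safe_text(driver, xpath):
    # Return the element's text, or None if the element cannot be located.
    try:
        return driver.find_element_by_xpath(xpath).text
    except Exception:
        return None

# e.g. inside the loop over the CSV links:
# likes2 = safe_text(driver, '/html/body/div[1]/section/main/div/div[1]/article/div[3]/section[2]/div/div/a/span')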

ERROR selenium.common.exceptions.StaleElementReferenceException (the browser pop-up window does not close)

I can't close the pop-up window. I wrote a function that should close it, but it doesn't, and the exception handling doesn't help either; it immediately throws the same error.
I would be grateful for your help. I have already spent a lot of time trying different approaches, including locating the element by XPath and by class, and nothing works.
@bot.message_handler(commands=['start'])
def start(message):
    bot.send_message(message.chat.id, "Generating the price list")
    b = 0
    # Eldorado
    #while True:
    link_eldo = "https://www.mvideo.ru/product-list-page-cls?q=redmi&limit=12&region_id=1&category_id=cat2_cis_0000000357"
    print (requests.get(link_eldo))
    # requests_url = requests.get(link_eldo)
    # if requests_url.status_code == 403:
    #     print("It worked")
    #     options.add_argument(f"user-agent={user_agent.random}")
    driver.get(link_eldo)
    def func(i):
        i = i + 1
        return i
    def close_banner(browser):
        wait(browser, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "close"))).click()
    #time.sleep(20)
    name_eldo_product = driver.find_elements_by_xpath("/html/body/div[2]/div[1]/div[4]/div[2]/div[3]/div/div[1]/div[1]/div['func(0)']/div[3]/div/div[1]/h4/a")
    price_eldo_product = driver.find_elements_by_class_name("fl-product-tile-price__current")
    name_product = []
    price_product = []
    for name in name_eldo_product:
        name_product.append(name.text)
    for price in price_eldo_product:
        price = price.text
        price = re.sub('[\W_]+', '', price)
        rub = "р"
        for clear in rub:
            price = price.replace(clear, "")
        price_product.append(int(price))
    print(price_product)
    creat_dic = dict(zip(name_product, price_product))
    main_products = {}
    main_products.update(creat_dic)
    print (main_products)
    link_eldo = "https://www.mvideo.ru/product-list-page-cls?q=iphone12&limit=12&region_id=1&category_id=cat2_cis_0000000357"
    print (requests.get(link_eldo))
    # requests_url = requests.get(link_eldo)
    # if requests_url.status_code == 403:
    #     print("It worked")
    #     options.add_argument(f"user-agent={user_agent.random}")
    driver.get(link_eldo)
    try:
        close_banner(driver)
        name_eldo_product = driver.find_elements_by_xpath("/html/body/div[2]/div[1]/div[4]/div[2]/div[3]/div/div[1]/div[1]/div['func(0)']/div[3]/div/div[1]/h4/a")
        price_eldo_product = driver.find_elements_by_class_name("fl-product-tile-price__current")
    except NoSuchElementException:
        # spelling error making this code not work as expected
        pass
    except StaleElementReferenceException:
        close_banner(driver)
        pass
    except TimeoutException:
        print('poput')
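For reference, a common way to make a click like close_banner more tolerant of a stale reference is to re-locate the element and retry the click a few times. A generic sketch of that pattern, not a verified fix for this particular page (click_with_retry is a hypothetical helper, and wait is assumed to be the WebDriverWait alias used above):
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

def click_with_retry(browser, locator, attempts=3, timeout=10):
    # Re-locate the element on every attempt, so a stale reference is simply retried.
    for _ in range(attempts):
        try:
            wait(browser, timeout).until(EC.element_to_be_clickable(locator)).click()
            return True
        except StaleElementReferenceException:
            continue
        except TimeoutException:
            break
    return False

# e.g. instead of close_banner(driver):
# click_with_retry(driver, (By.CLASS_NAME, "close"))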

I can't export scraped data to CSV

I can't get all of the data into the CSV, only the last record. When the scraping is done, only the last record scraped is saved to the CSV file, but I want to save the data from all pages.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
mainurl = 'https://austin.craigslist.org/search/cta?s=0'
driver.get(mainurl)
res = driver.execute_script("return document.documentElement.outerHTML")
page_soup = BeautifulSoup(res, 'html.parser')
lnk_opener = driver.find_element_by_xpath('//*[@id="sortable-results"]/ul/li[1]/p/a').click()
time.sleep(4)
records = []
i = 1
while i <3:
    i+=1
    try:
        print(driver.current_url)
    except Exception:
        print('Internet Error Detected')
    try:
        title = driver.find_element_by_xpath('//*[@id="titletextonly"]').text
        print(title)
    except Exception:
        print('No Title Given')
    try:
        price = driver.find_element_by_xpath('/html/body/section/section/h2/span/span[2]').text
        print(price)
    except Exception:
        print('No Price Given')
    try:
        phone = driver.find_element_by_xpath('//*[@id="postingbody"]/h2[1]/big').text
        print(phone)
        records.append((phone))
    except Exception:
        print('No Mobile number avalible')
    try:
        loc = driver.find_element_by_xpath('/html/body/section/section/section/div[1]/div/div[2]').text
        print(loc)
    except Exception:
        print('No Location Data Avalible')
    try:
        img = page_soup.find('img')
        immg = print(img.get('src','\n'))
    except Exception:
        print('No img Found')
    nxtpg = driver.find_element_by_xpath('/html/body/section/section/header/div[1]/div/a[3]')
    nxtpg.click()
    time.sleep(4)
    url = driver.find_element_by_xpath("/html/body/section/section/header/div[1]/div/a[3]").get_attribute("href")
    if url == None:
        bckbtn = driver.find_element_by_class_name('backup').click()
        time.sleep(5)
        nextbuttton = driver.find_element_by_xpath('//*[@id="searchform"]/div[3]/div[3]/span[2]/a[3]').click()
        time.sleep(6)
print(records)
records.append((driver.current_url, title, price, loc, immg))
df = pd.DataFrame(records, columns=['Product Url', 'Title/Model/Make', 'Price', 'GM Location', 'Image Link'])
print(df)
df.to_csv('zzz.csv')
time.sleep(4)
driver.quit()
I think this line
records.append((driver.current_url, title, price, loc, immg))
should be inside the while loop. Also, move i += 1 to the end of the loop body; otherwise you're skipping the first iteration.
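In outline, the corrected structure looks like this (a runnable sketch in which placeholder values stand in for the scraped fields; only the loop shape is the point):
import pandas as pd

records = []
i = 1
while i < 3:
    # placeholder values standing in for the title, price, loc and immg scraped on each pass
    title, price, loc, immg = f'listing {i}', '$1000', 'Austin', 'img.jpg'
    records.append(('https://example.org/post', title, price, loc, immg))
    i += 1  # incremented at the end so the first iteration is not skipped

df = pd.DataFrame(records, columns=['Product Url', 'Title/Model/Make', 'Price', 'GM Location', 'Image Link'])
print(df)
df.to_csv('zzz.csv')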

Python dictionary: looking up specific keys

I have code that scrapes everything from a specific web page. I now want to build code that lets me look up specific details: for example, if I enter a style ID, it should give me the details related to it, or if I enter a category, it should give me all the items in that category with their details. My code is:
import requests, re
from bs4 import BeautifulSoup
url="http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r=requests.get(url)
soup=BeautifulSoup(r.content)
links=soup.find_all("a")
img=soup.find(itemprop="image")
g_d4=soup.find_all("ol", {"class":"breadcrumb"})
for item in g_d4:
    links_2=soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
    pattern_2=re.compile("clothing/(\w+)")
    for link in links_2:
        match_1=pattern_2.search(link["href"])
        if match_1:
            print ("Category:- " + match_1.group(1))
            break
g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        print ("\n\nBRAND:-" + item.contents[1].text)
    except:
        pass
    try:
        a_1=item.find("ol", {"class":"breadcrumb"})
        a_2=a_1.text
        print(a_2)
    except:
        pass
    try:
        print ("TYPE:-" + item.find("h1",{"class":"product-name"},{"itemprop":"name"}).text+';')
    except:
        pass
    try:
        d2=item.find("div",{"class":"panel-body standard-p"})
        d3=d2.text
        p_id=re.findall(r'[0-9]{9}',d3)
        id_2=p_id[0]
        url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p='+str(id_2)+'&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
        r_1= requests.get(url_1)
        pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
        product_ids = pattern.findall(str(r_1.content))
        print ("DETAILS:- " + d3+';')
        print ("\nStyle ID:- " + id_2+';')
        print ("\nRecommended Product ID's:- ")
        print (','.join(i for i in product_ids))
    except:
        pass
    try:
        print ("\nURL:-" + img["src"]+';')
    except:
        pass
    try:
        print ("\nFull Price:-" + item.find("span",{"class":"price-standard"}).text+';')
    except:
        pass
    try:
        print ("\nDiscounted Price:-" + item.find("span",{"class":"price-sales"}).text+';')
    except:
        pass
g_d2=soup.find_all("div", {"class":"color-scroll"})
pattern_1=re.compile("pid=(\w+)")
for item in g_d2:
    links_1=soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
    for link in links_1[1:]:
        match=pattern_1.search(link["href"])
        if match:
            print ("\nProduct ID of other color:-")
            print (match.group(1))
I added a dictionary called d
import requests, re
from bs4 import BeautifulSoup
d={}
url="http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r=requests.get(url)
soup=BeautifulSoup(r.content)
links = soup.find_all("a")
d["links"] = []
d["links"].append(("href", [link.get("href") for link in links]))
d["links"].append(("class", [link.get("class") for link in links]))
img=soup.find(itemprop="image")
d["img"] = []
d["img"].append([("alt", img.get("alt")), ("src", img.get("src")), ("itemprop", img.get("itemprop")), ("class", img.get("class")[0])]) #You will have to put d["img"]["0"] instead of d["img"]["alt"]
g_d4=soup.find_all("ol", {"class":"breadcrumb"})
for item in g_d4:
    links_2=soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
    pattern_2=re.compile("clothing/(\w+)")
    for link in links_2:
        match_1=pattern_2.search(link["href"])
        if match_1:
            print ("Category:- " + match_1.group(1))
            break
g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        d["Brand"] = item.contents[1].text
        print ("\n\nBRAND:-" + item.contents[1].text)
    except:
        pass
    try:
        a_1=item.find("ol", {"class":"breadcrumb"})
        a_2=a_1.text
        d["a_2"] = a_2
        print(a_2)
    except:
        pass
    try:
        print ("TYPE:-" + item.find("h1",{"class":"product-name"},{"itemprop":"name"}).text+';')
        d["Type"] = item.find("h1",{"class":"product-name"},{"itemprop":"name"}).text
    except:
        pass
    try:
        d2=item.find("div",{"class":"panel-body standard-p"})
        d3=d2.text
        p_id=re.findall(r'[0-9]{9}',d3)
        id_2=p_id[0]
        url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p='+str(id_2)+'&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
        r_1= requests.get(url_1)
        pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
        product_ids = pattern.findall(str(r_1.content))
        print ("DETAILS:- " + d3+';')
        d["Details"] = d3.split(",")
        print ("\nStyle ID:- " + id_2+';')
        d["Style"] = ("ID", id_2)
        print ("\nRecommended Product ID's:- ")
        print (','.join(i for i in product_ids))
        d["RecommendedProductIDs"] = [i for i in product_ids]
    except:
        pass
    try:
        print ("\nURL:-" + img["src"]+';')
    except:
        pass
    try:
        print ("\nFull Price:-" + item.find("span",{"class":"price-standard"}).text+';')
    except:
        pass
    try:
        print ("\nDiscounted Price:-" + item.find("span",{"class":"price-sales"}).text+';')
    except:
        pass
g_d2=soup.find_all("div", {"class":"color-scroll"})
pattern_1=re.compile("pid=(\w+)")
for item in g_d2:
    links_1=soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
    for link in links_1[1:]:
        match=pattern_1.search(link["href"])
        if match:
            print ("\nProduct ID of other color:-")
            print (match.group(1))
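Once d has been populated, specific details can be read back by key, for example:
# Values are present only if the corresponding try block above succeeded,
# so .get() is used to avoid a KeyError.
print(d.get("Style"))                  # ("ID", <9-digit style id>) if found
print(d.get("Details"))                # list of detail fragments from the product panel
print(d.get("RecommendedProductIDs"))  # list of recommended product IDs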
