I am quite new to scraping with XPath. I am trying to scrape product information on Target. Using Selenium and XPath I can successfully get the price and name, but XPath does not return any value when I scrape for the product size and the sales location.
For example, for this URL "https://www.target.com/p/pataday-once-daily-relief-extra-strength-drops-0-085-fl-oz/-/A-83775159?preselect=81887758#lnk=sametab", the XPath for size is:
//*[#id="pageBodyContainer"]/div[1]/div[2]/div[2]/div/div[3]/div/div[1]/text()
The XPath for sales location is:
//*[#id="pageBodyContainer"]/div[1]/div[2]/div[2]/div/div[1]/div[2]/span
I also tried to get these two elements using requests, but that did not work either. Does anyone know why this happens? Any help appreciated. Thanks.
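As a side note, Selenium's find_element can only locate element nodes, not text nodes, so an XPath that ends in /text() will not return anything usable; the usual workaround is to point the XPath at the enclosing element and read its .text property. Below is a minimal, hedged sketch of that idea, reusing the size XPath from above without the trailing /text() (the 10-second wait is an illustrative choice, not something from the original script):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wd = webdriver.Chrome()
wd.get("https://www.target.com/p/pataday-once-daily-relief-extra-strength-drops-0-085-fl-oz/-/A-83775159?preselect=81887758#lnk=sametab")

# Locate the element itself (no /text() suffix) and read its text content.
size_xpath = "//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[3]/div/div[1]"
try:
    size_el = WebDriverWait(wd, 10).until(
        EC.presence_of_element_located((By.XPATH, size_xpath)))
    size = size_el.text  # visible text of the element and its children
except Exception:
    size = "none"
print(size)
wd.quit()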
Following is my code:
import concurrent.futures
import csv
from datetime import datetime
from time import sleep

import pytz
from apscheduler.schedulers.blocking import BlockingScheduler
from selenium import webdriver
from selenium.webdriver.common.by import By


def job_function():
    urlList = ['https://www.target.com/p/pataday-once-daily-relief-extra-strength-drops-0-085-fl-oz/-/A-83775159?preselect=81887758#lnk=sametab',
               'https://www.target.com/p/kleenex-ultra-soft-facial-tissue/-/A-84780536?preselect=12964744#lnk=sametab',
               'https://www.target.com/p/claritin-24-hour-non-drowsy-allergy-relief-tablets-loratadine/-/A-80354268?preselect=14351285#lnk=sametab',
               'https://www.target.com/p/opti-free-pure-moist-rewetting-drops-0-4-fl-oz/-/A-14358641#lnk=sametab'
               ]

    def ScrapingTarget(url):
        AArray = []
        wait_imp = 10
        CO = webdriver.ChromeOptions()
        CO.add_experimental_option('useAutomationExtension', False)
        CO.add_argument('--ignore-certificate-errors')
        CO.add_argument('--start-maximized')
        wd = webdriver.Chrome(r'D:\chromedriver\chromedriver_win32new\chromedriver_win32 (2)\chromedriver.exe',
                              options=CO)
        wd.get(url)
        wd.implicitly_wait(wait_imp)
        sleep(1)

        # start scraping
        name = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[1]/h1/span").text
        sleep(0.5)
        price = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[1]/span").text
        sleep(0.5)
        try:
            size = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[3]/div/div[1]/text()").text
        except:
            size = "none"
        sleep(0.5)
        try:
            sales_location = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[2]/span").text
        except:
            sales_location = "none"

        tz = pytz.timezone('Etc/GMT-0')
        GMT = datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        AArray.append([name, price, size, sales_location, GMT])

        with open(
                r'C:\Users\12987\PycharmProjects\python\Network\priceingAlgoriCoding\export_Target_dataframe.csv',
                'a', newline="", encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(AArray)

    with concurrent.futures.ThreadPoolExecutor(4) as executor:
        executor.map(ScrapingTarget, urlList)


sched = BlockingScheduler()
sched.add_job(job_function, 'interval', seconds=60)
sched.start()
I am trying to scrape some product prices on Target using the Selenium Chrome driver in Python. The script also keeps entering zip codes and scraping the new price for each one, and I expect the program to keep running on the server for a year.
The code runs successfully on the server for the first few hours; however, approximately 5 hours later, the server's administrator killed my process and said it was consuming a lot of /tmp space. I guess it is caused by Selenium and the Chrome driver, but I am not quite sure. Any help appreciated. Thanks.
/tmp space (free space):
/ 3765 MB (53% inode=70%):
/home 699 MB (81% inode=99%):
/opt 1169 MB (43% inode=99%):
/tmp 25 MB (1% inode=55%):
/usr/local 819 MB (95% inode=99%):
/var 5957 MB (78% inode=99%)
urlList = [.......]

data = read_csv("C:\\Users\\12987\\desktop\\zipcode\\zc.csv")
# converting column data to list
zipCodeList = data['Zipcode'].tolist()

while(True):
    AArray = []

    def ScrapingTarget(url):
        wait_imp = 10
        CO = webdriver.ChromeOptions()
        CO.add_experimental_option('useAutomationExtension', False)
        CO.add_argument('--ignore-certificate-errors')
        CO.add_argument('--start-maximized')
        wd = webdriver.Chrome(r'D:\chromedriver\chromedriver_win32new\chromedriver_win32 (2)\chromedriver.exe', options=CO)
        wd.get(url)
        wd.implicitly_wait(wait_imp)

        for zipcode in zipCodeList:
            # click the My Store
            myStore = wd.find_element(by=By.XPATH, value="//*[@id='web-store-id-msg-btn']/div[2]/div")
            myStore.click()
            sleep(0.5)

            # input ZipCode
            inputZipCode = wd.find_element(by=By.XPATH, value="//*[@id='zip-or-city-state']")
            inputZipCode.clear()
            inputZipCode.send_keys(zipcode)

            # click lookup
            clickLoopUP = wd.find_element(by=By.XPATH, value="//*[@id='overlay-1']/div[2]/div[1]/div/div[3]/div[2]/button")
            clickLoopUP.click()
            sleep(0.5)

            # choose Store
            store = wd.find_element(by=By.XPATH, value="//*[@id='overlay-1']/div[2]/div[3]/div[2]/div[1]/button")
            store.click()

            # start scraping
            name = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[1]/h1/span").text
            #nameArray.append(name)
            price = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[1]/span").text
            #priceArray.append(price)
            currentZipCode = zipcode
            #zipCodeArray.append(currentZipCode)
            tz = pytz.timezone('Europe/London')
            GMT = datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
            #GMTArray.append(GMT)

            # needed to click onto the "Show more" to get the tcin and upc
            xpath = '//*[@id="tabContent-tab-Details"]/div/button'
            element_present = EC.presence_of_element_located((By.XPATH, xpath))
            WebDriverWait(wd, 5).until(element_present)
            showMore = wd.find_element(by=By.XPATH, value=xpath)
            sleep(2)
            showMore.click()

            soup = BeautifulSoup(wd.page_source, 'html.parser')
            # gets a list of all elements under "Specifications"
            div = soup.find("div", {"class": "styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight"})
            list = div.find_all("div")
            for a in range(len(list)):
                list[a] = list[a].text
            # locates the elements in the list
            tcin = [v for v in list if v.startswith("TCIN")]
            upc = [v for v in list if v.startswith("UPC")]
            #TCIN.append(tcin)
            #UPC.append(upc)

            # scroll up
            #wd.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.HOME)
            wd.find_element(by=By.TAG_NAME, value='body').send_keys(Keys.CONTROL + Keys.HOME)

            AArray.append([name, price, currentZipCode, tcin, upc, GMT])

    with concurrent.futures.ThreadPoolExecutor(10) as executor:
        executor.map(ScrapingTarget, urlList)

    with open(r'C:\Users\12987\PycharmProjects\python\Network\priceingAlgoriCoding\export_Target_dataframe.csv',
              'a', newline="", encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(AArray)

    sleep(3600)
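One detail worth flagging in the code above: ScrapingTarget starts a new Chrome instance on every call (for every URL, every hour) and never calls wd.quit(), so each chromedriver/Chrome session leaves its temporary profile data behind, which is a plausible cause of /tmp filling up. A minimal sketch of the usual fix, assuming the scraping body stays as in the question (the elided middle is a placeholder, not real code):

def ScrapingTarget(url):
    CO = webdriver.ChromeOptions()
    CO.add_argument('--ignore-certificate-errors')
    wd = webdriver.Chrome(options=CO)
    try:
        wd.get(url)
        # ... the zip-code loop and scraping logic from the question ...
        pass
    finally:
        # Always shut the browser down, even if a locator raises,
        # so its temporary profile directory in /tmp gets cleaned up.
        wd.quit()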
Very new to Python and Selenium, looking to scrape a few data points. I'm struggling in three areas:
I don't understand how to loop through multiple URLs properly
I can't figure out why the script is iterating twice over each URL
I can't figure out why it's only outputting the data for the second URL
Much thanks for taking a look!
Here's my current script:
urls = [
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]

driver = webdriver.Chrome(executable_path='/Library/Frameworks/Python.framework/Versions/3.9/bin/chromedriver')

for url in urls:
    for page in range(0, 1):
        driver.get(url)
        wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))

        df = pd.DataFrame(columns=['Title', 'Core Web Vitals', 'FCP', 'FID', 'CLS', 'TTI', 'TBT', 'Total Score'])

        company = driver.find_elements_by_class_name("audited-url__link")
        data = []
        for i in company:
            data.append(i.get_attribute('href'))

        for x in data:
            # Get URL name
            title = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[2]/h1/a')
            co_name = title.text

            # Get Core Web Vitals text pass/fail
            cwv = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[1]/span[2]')
            core_web = cwv.text

            # Get FCP
            fcp = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[1]/div')
            first_content = fcp.text

            # Get FID
            fid = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[3]/div[1]/div')
            first_input = fid.text

            # Get CLS
            cls = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[4]/div[1]/div')
            layout_shift = cls.text

            # Get TTI
            tti = driver.find_element_by_xpath('//*[@id="interactive"]/div/div[1]')
            time_interactive = tti.text

            # Get TBT
            tbt = driver.find_element_by_xpath('//*[@id="total-blocking-time"]/div/div[1]')
            total_block = tbt.text

            # Get Total Score
            total_score = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[1]/a/div[2]')
            score = total_score.text

            # Adding all columns to dataframe
            df.loc[len(df)] = [co_name, core_web, first_content, first_input, layout_shift, time_interactive, total_block, score]

driver.close()

#df.to_csv('Double Page Speed Test 9-10.csv')
print(df)
Q1: I don't understand how to loop through multiple URLs properly?
Ans: for url in urls:
Q2: I can't figure out why the script is iterating twice over each URL.
Ans: Because you have for page in range(0, 1):
Update 1:
I did not run your entire code with the DataFrame. Also, sometimes one of the pages does not show the number and href, but when I run the code below,
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(50)
wait = WebDriverWait(driver, 20)

urls = [
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]

data = []
for url in urls:
    driver.get(url)
    wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
    company = driver.find_elements_by_css_selector("h1.audited-url a")
    for i in company:
        data.append(i.get_attribute('href'))

print(data)
this is the output:
['https://www.crutchfield.com//', 'https://www.lastpass.com/', 'https://www.lastpass.com/']
which is correct, because the element locator we used matches 1 element on the first page and 2 elements on the second page.
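If the duplicate href is unwanted, one small, hedged option (not from the original answer) is to de-duplicate the collected list while keeping its order before using it:

# Python 3.7+: dict preserves insertion order, so this drops duplicates in place
data = list(dict.fromkeys(data))
print(data)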
I am a student working on a scraping project and I am having trouble completing my script because it fills my computer's memory with all of the data it stores.
It currently keeps all of my data in memory until the end, so my solution would be to break the scrape into smaller chunks and write out the data periodically, rather than building one big list and writing it all out at the end.
In order to do this, I would need to stop my scroll method, scrape the loaded profiles, write out the data that I have collected, and then repeat this process without duplicating my data. It would be appreciated if someone could show me how to do this. Thank you for your help :)
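For reference, the batching idea described above (scroll one step, process only the profiles that were newly loaded, append that batch to the output file, repeat) can be sketched roughly as follows. The locators are taken from the question, but the already_done counter, the target_count limit, and the append-mode CSV writing are illustrative assumptions; only the profile links are written here to keep the sketch short, and the per-profile scraping from the question would slot in before the write:

import csv
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")

body = driver.find_element_by_xpath("//body")
already_done = 0      # how many profile links have already been written out
target_count = 200    # stop once this many links have been collected

while already_done < target_count:
    body.send_keys(Keys.END)   # trigger loading of the next chunk of profiles
    sleep(1)
    links = driver.find_elements_by_xpath("//div[@align='right']/a")
    new_links = links[already_done:]          # only the ones not yet processed
    if not new_links:
        break                                 # nothing new loaded; stop here
    rows = [[a.get_attribute('href')] for a in new_links]
    with open("Spredsheet.txt", "a", newline="") as output:
        csv.writer(output).writerows(rows)    # flush this batch to disk
    already_done = len(links)                 # avoids duplicating data next pass

driver.close()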
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException

Data = []

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))

body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)

    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"

    info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
    Data.extend(info)

    driver.close()
    driver.switch_to.window(driver.window_handles[0])

with open("Spredsheet.txt", "w") as output:
    output.write(','.join(Data))

driver.close()
Try the approach below using requests and BeautifulSoup. In the script I have used the site's own search-results API URL.
On the first iteration it builds the base URL, then writes the headers and the data to the .csv file.
On each following iteration it builds the URL again with 2 extra params, start_on_page and show_per_page, where start_on_page starts at 20 and is incremented by 20 on each iteration, and show_per_page is set to 100 to extract 100 records per iteration, and so on until all the data has been dumped into the .csv file.
The script dumps 4 things: number, name, location and profile URL.
On each iteration the data is appended to the .csv file, so this approach resolves your memory issue.
Before running the script, do not forget to set the file_path variable to the directory where you want the .csv file created.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv

def scrap_directory_data():
    list_of_credentials = []
    file_path = ''
    file_name = 'credential_list.csv'
    count = 0
    page_number = 0
    page_size = 100
    create_url = ''
    main_url = 'https://directory.bcsp.org/search_results.php?'
    first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
    number_of_records = 0
    csv_headers = ['#', 'Name', 'Location', 'Profile URL']

    while True:
        if count == 0:
            create_url = main_url + first_iteration_url
            print('-' * 100)
            print('1 iteration URL created: ' + create_url)
            print('-' * 100)
        else:
            create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
            print('-' * 100)
            print('Other then first iteration URL created: ' + create_url)
            print('-' * 100)

        page = requests.get(create_url, verify=False)
        extracted_text = bs(page.text, 'lxml')
        result = extracted_text.find_all('tr')

        if len(result) > 0:
            for idx, data in enumerate(result):
                if idx > 0:
                    number_of_records += 1
                    name = data.contents[1].text
                    location = data.contents[3].text
                    profile_url = data.contents[5].contents[0].attrs['href']
                    list_of_credentials.append({
                        '#': number_of_records,
                        'Name': name,
                        'Location': location,
                        'Profile URL': profile_url
                    })
                print(data)

                with open(file_path + file_name, 'a+') as cred_CSV:
                    csvwriter = csv.DictWriter(cred_CSV, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                    if idx == 0 and count == 0:
                        print('Writing CSV header now...')
                        csvwriter.writeheader()
                    else:
                        for item in list_of_credentials:
                            print('Writing data rows now..')
                            print(item)
                            csvwriter.writerow(item)
                        list_of_credentials = []
        else:
            break  # no rows returned, all pages have been processed

        count += 1
        page_number += 20

scrap_directory_data()
So I am scraping listings off of Craigslist, and my lists of titles, prices, and dates are being overwritten every time the web driver goes to the next page. In the end, the only data in my .csv file and MongoDB collection are the listings from the last page.
I tried moving where the lists are instantiated, but they still get overwritten.
The function that extracts listing information from a page:
def extract_post_information(self):
    all_posts = self.driver.find_elements_by_class_name("result-row")

    dates = []
    titles = []
    prices = []

    for post in all_posts:
        title = post.text.split("$")
        if title[0] == '':
            title = title[1]
        else:
            title = title[0]
        title = title.split("\n")
        price = title[0]
        title = title[-1]
        title = title.split(" ")
        month = title[0]
        day = title[1]
        title = ' '.join(title[2:])
        date = month + " " + day

        if not price[:1].isdigit():
            price = "0"
        int(price)

        titles.append(title)
        prices.append(price)
        dates.append(date)

    return titles, prices, dates
The function that goes to the URL and keeps going to the next page until there is no next page:
def load_craigslist_url(self):
    self.driver.get(self.url)
    while True:
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_element_located((By.ID, "searchform")))
            print("Page is loaded")
            self.extract_post_information()
            WebDriverWait(self.driver, 2).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform"]/div[3]/div[3]/span[2]/a[3]'))).click()
        except:
            print("Last page")
            break
My main
if __name__ == "__main__":
filepath = '/home/diego/git_workspace/PyScrape/data.csv' # Filepath of written csv file
location = "philadelphia" # Location Craigslist searches
postal_code = "19132" # Postal code Craigslist uses as a base for 'MILES FROM ZIP'
max_price = "700" # Max price Craigslist limits the items too
query = "graphics+card" # Type of item you are looking for
radius = "400" # Radius from postal code Craigslist limits the search to
# s = 0
scraper = CraigslistScraper(location, postal_code, max_price, query, radius)
scraper.load_craigslist_url()
titles, prices, dates = scraper.extract_post_information()
d = [titles, prices, dates]
export_data = zip_longest(*d, fillvalue='')
with open('data.csv', 'w', encoding="utf8", newline='') as my_file:
wr = csv.writer(my_file)
wr.writerow(("Titles", "Prices", "Dates"))
wr.writerows(export_data)
my_file.close()
# scraper.kill()
scraper.upload_to_mongodb(filepath)
What I expect it to do is get all the info from one page, go to the next page, get all of that page's info, and append it to the three lists titles, prices, and dates in the extract_post_information function. Once there are no more pages, it should create a list called d out of those three lists (seen in my main function).
Should I put the extract_post_information function in the load_craigslist_url function? Or do I have to tweak where I instantiate the three lists in the extract_post_information function?
In the load_craigslist_url() function, you're calling self.extract_post_information() without saving the returned information.
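A minimal sketch of one way to act on that, assuming the page data should be accumulated on the instance (the all_titles / all_prices / all_dates attribute names are illustrative, not from the original class):

def load_craigslist_url(self):
    # accumulators that survive across pages (illustrative attribute names)
    self.all_titles, self.all_prices, self.all_dates = [], [], []
    self.driver.get(self.url)
    while True:
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_element_located((By.ID, "searchform")))
            # keep what this page returned instead of discarding it
            titles, prices, dates = self.extract_post_information()
            self.all_titles.extend(titles)
            self.all_prices.extend(prices)
            self.all_dates.extend(dates)
            WebDriverWait(self.driver, 2).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform"]/div[3]/div[3]/span[2]/a[3]'))).click()
        except Exception:
            print("Last page")
            break
    return self.all_titles, self.all_prices, self.all_dates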
I have tried many times, but it does not work:
import requests
from lxml import html, etree
from selenium import webdriver
import time, json

# how many pages do you want to scan
page_numnotint = input("how many page do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'

url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page=1&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'
driver = webdriver.Chrome()
driver.get(url)
base_html = driver.page_source
selctor = etree.HTML(base_html)

date_info = []
name_data, price_data = [], []
jd_goods_data = {}

for q in range(page_num):
    i = int(1)
    while True:
        name_string = '//*[@id="plist"]/ul/li[%d]/div/div[3]/a/em/text()' % (i)
        price_string = '//*[@id="plist"]/ul/li[%d]/div/div[2]/strong[1]/i/text()' % (i)
        if i == 60:
            break
        else:
            i += 1
            name = selctor.xpath(name_string)[0]
            name_data.append(name)
            price = selctor.xpath(price_string)[0]
            price_data.append(price)
            jd_goods_data[name] = price
            print(name_data)

    with open(file_name, 'w') as f:
        json.dump(jd_goods_data, f)

    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click()
    time.sleep(2)

# for k, v in jd_goods_data.items():
#     print(k, v)
I am trying to download some product details, but it does not work. If you type 2 pages to scan, it only downloads the details of one page, but twice!
Ok, you define q but you do not actually use it as such. In this case, the convention is to name this unused variable as _. I mean, instead of doing
for q in range(page_num):
you should do
for _ in range(page_num):
Thus, other programmers will directly know that you do not use q and only want your operation to be repeated.
Now, you say that only one page's details are downloaded, twice, which means that (for some reason) the line driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click() does not take effect as expected. For sure there is a way to make it work, but in your case I heuristically see that your URL contains a parameter whose name is page. I recommend you use it instead, which then leads to actually using the variable q as such, as follows:
import requests
from lxml import html, etree
from selenium import webdriver
import time, json

# how many pages do you want to scan
page_numnotint = input("how many page do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'

driver = webdriver.Chrome()

date_info = []
name_data, price_data = [], []
jd_goods_data = {}

for q in range(page_num):
    url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page={page}&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'.format(page=q)
    driver.get(url)
    base_html = driver.page_source
    selctor = etree.HTML(base_html)
    i = 1
    while True:
        name_string = '//*[@id="plist"]/ul/li[%d]/div/div[3]/a/em/text()' % (i)
        price_string = '//*[@id="plist"]/ul/li[%d]/div/div[2]/strong[1]/i/text()' % (i)
        if i == 60:
            break
        else:
            i += 1
            name = selctor.xpath(name_string)[0]
            name_data.append(name)
            price = selctor.xpath(price_string)[0]
            price_data.append(price)
            jd_goods_data[name] = price
            print(name_data)

    with open(file_name, 'w') as f:
        json.dump(jd_goods_data, f)

driver.quit()