I'm making this web scraper for a project, but it only returns one of the values I'm looking for instead of also running through the other 18 elements in listings. It returns all the information for one house, but I want the information for the other 18 houses stored in the variables as well. Thanks very much.
'''
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
my_url = "https://www.daft.ie/ireland/property-for-sale/"
#open connection and grab webpage
uClient = uReq(my_url)
#store html in a variable
page_html = uClient.read()
#close web connection
uClient.close()
#parse html
soup = BeautifulSoup(page_html, "html.parser")
print(soup)
#grabs listings house information
listings = soup.findAll("div", {"class":"FeaturedCardPropertyInformation__detailsContainer"})
for container in listings:
    #extracting price
    price = container.div.div.strong.text
    #location
    name_container = container.div.find("a", {"class": "PropertyInformationCommonStyles__addressCopy--link"}).text
    #house type
    house = container.div.find("div", {"class": "QuickPropertyDetails__propertyType"}).text
    #number of bathrooms
    bath_num = container.div.find("div", {"class": "QuickPropertyDetails__iconCopy--WithBorder"}).text
    #number of bedrooms
    bed_num = container.div.find("div", {"class": "QuickPropertyDetails__iconCopy"}).text
'''
You can simply create a blank list before the for loop and append all the variables in every iteration to store all the data in a single list.
Your code will look as follows:
data = []

for container in listings:
    # extracting price
    price = container.div.div.strong.text
    # location
    name_container = container.div.find("a", {"class": "PropertyInformationCommonStyles__addressCopy--link"}).text
    # house type
    house = container.div.find("div", {"class": "QuickPropertyDetails__propertyType"}).text
    # number of bathrooms
    bath_num = container.div.find("div", {"class": "QuickPropertyDetails__iconCopy--WithBorder"}).text
    # number of bedrooms
    bed_num = container.div.find("div", {"class": "QuickPropertyDetails__iconCopy"}).text
    data.append((price, name_container, house, bath_num, bed_num))

print(data)
Your final output will look as follows:
[('€1,350,000', 'The Penthouse at Hanover Quay, 27 Hanover Dock, Grand Canal Dock, Dublin 2', 'Apartment for sale', '2', '3'), ('€450,000', '9 Na Ceithre Gaoithe Ring, Dungarvan, Co. Waterford', ' Detached House', '4', '5'), ('€390,000', 'Cave, Caherlistrane, Co. Galway', ' Detached House', '4', '5'), ('€720,000', '18 Hazelbrook Road, Terenure, Terenure, Dublin 6', ' Detached House', '3', '4'), ('€210,000', 'Carraig Abhainn, Ballisodare, Co. Sligo', 'Bungalow for sale', '1', '3'), ('€495,000', 'Campbell Court, Cairns Hill, Sligo, Co. Sligo', ' Detached House', '4', '4'), ('€125,000', '33 Leim An Bhradain, Gort Road, Ennis, Co. Clare', 'Apartment for sale', '2', '2'), ('€395,000', '1 Windermere Court, Bishopstown, Bishopstown, Co. Cork', ' End of Terrace House', '3', '4'), ('€349,000', '59 Dun Eoin, Ballinrea Road, Carrigaline, Co. Cork', ' Detached House', '3', '4'), ('€515,000', '2 Elm Walk, Classes Lake, Ovens, Co. Cork', ' Detached House', '5', '4'), ('€490,000', '9 Munster st., Phibsborough, Dublin 7', ' Terraced House', '2', '4'), ('€249,950', '47 Westfields, Clare Road, Ennis, Co. Clare', ' Detached House', '3', '4'), ('€435,000', '3 Castlelough Avenue, Loreto Road, Killarney, Co. Kerry', ' Detached House', '3', '4'), ('€620,000', 'Beaufort House, Knockacleva, Philipstown, Dunleer, Dunleer, Co. Louth', ' Detached House', '3', '5'), ('€550,000', "Flat 5, Saint Ann's Apartments, Donnybrook, Dublin 4", 'Apartment for sale', '2', '2'), ('€675,000', '3 Church Hill, Innishannon, Co. Cork', ' Detached House', '3', '5'), ('€495,000', 'River Lodge, The Rower, Inistioge, Co. Kilkenny', ' Detached House', '4', '4'), ('€325,000', 'Coolgarrane House, Coolgarrane, Thurles, Co. Tipperary', ' Detached House', '1', '4'), ('€399,950', 'No 14 Coopers Grange Old Quarter, Ballincollig, Co. Cork', ' Semi-Detached House', '3', '4')]
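If you also want to keep the scraped rows after the script finishes, one option is to write the data list to a CSV file at the end. A minimal sketch, assuming the data list of tuples built above (the filename is just an example):

import csv

# Write the collected tuples to a CSV file; one row per house.
with open("daft_listings.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["price", "address", "house_type", "bathrooms", "bedrooms"])
    writer.writerows(data)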
Related
I'm trying to scrape this website, but I keep getting this error each time, even though there is text in the span tag:
line 49, in <module>
beds.append(bed[i].text)
IndexError: list index out of range
I want to scrape all the beds text on each advertisement page.
# 1 lists
district_Name = []
property_size = []
property_price = []
links = []
dates = []
beds = []
paths = []
page_num = 0

# 2 the link of website
while True:
    result = requests.get(f"https://www.bayut.sa/en/riyadh-region/villas-for-sale/page-{page_num}/")
    src = result.content
    # 4 create soup
    soup = BeautifulSoup(src, "lxml")
    if page_num > 288:
        break
    # 5 titles we need: districtName, property Age, size, rooms, price
    districtName = soup.findAll("div", {"aria-label": "Location"})
    size = soup.findAll("span", {"aria-label": "Area"})
    price = soup.findAll("span", {"aria-label": "Price"})
    listing_link = soup.findAll("a", {"aria-label": "Listing link"})
    bed = soup.findAll("span", {"aria-label": "Beds"}, {"class": "b6a29bc0"})
    path = soup.findAll("span", {"aria-label": "Beds"}, {"class": "b6a29bc0"})
    main_url = 'https://www.bayut.sa'
    # 6 for loop to get text and append it to a list
    for i in range(len(districtName)):
        district_Name.append(districtName[i].text)
        links.append(main_url + listing_link[i].attrs["href"])
        property_size.append(size[i].text)
        property_price.append(price[i].text)
        beds.append(bed[i].text)
        paths.append(path[i].text)
    for link in (links):
        result = requests.get(link)
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        date = soup.find("span", {"aria-label": "Reactivated date"})
        dates.append(date.text)
    page_num += 1
If a piece of information is missing (e.g. the bed count), your lists will end up with different lengths, so you have to handle these cases.
'bed' : e.text if (e := item.find("span", {"aria-label": "Beds"}, {"class": "b6a29bc0"})) else None,
In my opinion, you should rethink the way you process the data, move away from the abundance of lists and focus on a more structured approach.
Example
import requests
from bs4 import BeautifulSoup

data = []
page_num = 0

while True:
    result = requests.get(f"https://www.bayut.sa/en/riyadh-region/villas-for-sale/page-{page_num}/")
    src = result.content
    soup = BeautifulSoup(src, "lxml")
    main_url = 'https://www.bayut.sa'

    for item in soup.select('li[aria-label="Listing"]'):
        data.append({
            'districtName': item.find("div", {"aria-label": "Location"}).text,
            'size': item.find("span", {"aria-label": "Area"}).text,
            'price': item.find("span", {"aria-label": "Price"}).text,
            'listing_link': main_url + item.find("a", {"aria-label": "Listing link"}).attrs["href"],
            'bed': e.text if (e := item.find("span", {"aria-label": "Beds"}, {"class": "b6a29bc0"})) else None,
            'path': item.find("span", {"aria-label": "Beds"}, {"class": "b6a29bc0"}).text
        })
    #....
    #....
    page_num += 1
    if page_num > 3:
        break

data
Output
[{'districtName': 'Al Rimal, East Riyadh, Riyadh, Riyadh Region', 'size': '300 Sq. M.', 'price': '1,500,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488480.html', 'bed': '6', 'path': '6'}, {'districtName': 'Al Yarmuk, East Riyadh, Riyadh, Riyadh Region', 'size': '285 Sq. M.', 'price': '1,300,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488484.html', 'bed': '4', 'path': '4'}, {'districtName': 'Al Khaleej, East Riyadh, Riyadh, Riyadh Region', 'size': '280 Sq. M.', 'price': '1,750,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488611.html', 'bed': '4', 'path': '4'}, {'districtName': 'Al Arid, North Riyadh, Riyadh, Riyadh Region', 'size': '420 Sq. M.', 'price': '3,000,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488444.html', 'bed': '4', 'path': '4'}, {'districtName': 'Ishbiliyah, East Riyadh, Riyadh, Riyadh Region', 'size': '900 Sq. M.', 'price': '4,000,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488471.html', 'bed': '4', 'path': '4'}, {'districtName': 'Al Malqa, North Riyadh, Riyadh, Riyadh Region', 'size': '600 Sq. M.', 'price': '7,100,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488439.html', 'bed': '5', 'path': '5'}, {'districtName': 'Al Narjis, North Riyadh, Riyadh, Riyadh Region', 'size': '312 Sq. M.', 'price': '2,600,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488396.html', 'bed': '4', 'path': '4'}, {'districtName': 'Al Malqa, North Riyadh, Riyadh, Riyadh Region', 'size': '720 Sq. M.', 'price': '8,700,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488443.html', 'bed': '5', 'path': '5'}, {'districtName': 'Al Khaleej, East Riyadh, Riyadh, Riyadh Region', 'size': '450 Sq. M.', 'price': '2,500,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488472.html', 'bed': '5', 'path': '5'}, {'districtName': 'Al Malqa, North Riyadh, Riyadh, Riyadh Region', 'size': '375 Sq. M.', 'price': '4,800,000', 'listing_link': 'https://www.bayut.sa/en/property/details-87488526.html', 'bed': '5', 'path': '5'},...]
I'm trying to extract a table from a webpage I'm working on, storing the headers as keys and the body as values, but kept separate so I can tell which page they come from. Here's what I have tried:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup  # needed below to parse driver.page_source
import time  # needed for time.sleep

s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)
driver.maximize_window()
driver.get('https://www.google.com')

all_data = []
for i in range(1, 6):
    url = "https://www.transfermarkt.co.uk/silvio-adzic/profil/spieler/{}".format(i)
    driver.get(url)
    time.sleep(3)
    data = {}
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    print(f"In Page {i}")
    for th in soup.select("#yw2 tr"):
        data[th.get_text(strip=True)] = th.find_next('td').get_text(strip=True)
    all_data.append(data)
However this produces a jumbled dictionary:
[{'competitionwettbewerb': 'Total :',
'Total :25169524616.948': 'Total :',
'Regionalliga Süd': 'Regionalliga Süd',
'Regionalliga Süd7922-2876.318': '',
'2. Bundesliga': '2. Bundesliga',
'2. Bundesliga60933873.487': '',
'RL West-Südwest': 'RL West-Südwest',
'RL West-Südwest5818-1943.493': '',
'Oberliga Südwest': 'Oberliga Südwest',
'Oberliga Südwest2015-1101.649': '',
'Bundesliga': 'Bundesliga',
'Bundesliga1212355355': '',
..
..
..
(Expected outcome) Is there a way to separate these for each page that's extracted, so I get something like this?
[{'p1':{'competition': ["Regionalliga Süd", "2. Bundesliga", ...],
'Appearances': [79, 60,...],
'Goals':[22, 9,...],
'Assists':[-, 3, ...]
...},
'p2':{'competition': ["Bundesliga", "2. Bundesliga", ...],
'Appearances': [262, 98,...],
'Goals':[62, 18,...],
'Assists':[79, -, ...]
...}}]
This needs more complex code that works with every row and every cell separately.
First I create a place for all the data:
data = {'competition': [], 'Appearances': [], 'Goals':[], 'Assists':[]}
Next I use a for-loop to get the rows of the table.
But there are two problems:
some tr are empty, but they don't have a class, so it is easy to skip them. I also use tbody to skip the header row.
on some pages the table uses ID yw2 and on others yw1, but both are inside a div with data-viewport=Leistungsdaten_Saison, so it is easy to get the correct tables.
for tr in soup.select("div[data-viewport=Leistungsdaten_Saison] tbody tr[class]"):
Next I get all the cells in a row and put the values into the correct lists.
cells = tr.find_all('td')
#print(cells)
data['competition'].append(cells[1].get_text(strip=True))
data['Appearances'].append(cells[2].get_text(strip=True))
data['Goals'] .append(cells[3].get_text(strip=True))
data['Assists'] .append(cells[4].get_text(strip=True))
And finally I put the data into all_data with the key p1, p2, etc.
all_data.append({'p{}'.format(i): data})
Full working code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup

s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)
driver.maximize_window()

all_data = []

for i in range(1, 6):
    print('--- page', i, '---')

    url = "https://www.transfermarkt.co.uk/silvio-adzic/profil/spieler/{}".format(i)
    driver.get(url)
    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, 'html5lib')

    data = {'competition': [], 'Appearances': [], 'Goals': [], 'Assists': []}

    for tr in soup.select("div[data-viewport=Leistungsdaten_Saison] tbody tr[class]"):
        cells = tr.find_all('td')
        #print(cells)
        data['competition'].append(cells[1].get_text(strip=True))
        data['Appearances'].append(cells[2].get_text(strip=True))
        data['Goals'].append(cells[3].get_text(strip=True))
        data['Assists'].append(cells[4].get_text(strip=True))

    all_data.append({'p{}'.format(i): data})

# --- display ---
for player in all_data:
    name, data = list(player.items())[0]
    print('---', name, '---')
    for key, value in data.items():
        print(key, value)
Result:
--- p1 ---
competition ['Regionalliga Süd', '2. Bundesliga', 'RL West-Südwest', 'Oberliga Südwest', 'Bundesliga', 'NOFV-Oberliga Süd', 'Oberliga Bayern', 'DFB-Pokal', 'UEFA Cup']
Appearances ['79', '60', '58', '20', '12', '9', '6', '6', '1']
Goals ['22', '9', '18', '15', '1', '-', '2', '2', '-']
Assists ['-', '3', '-', '-', '2', '-', '-', '-', '-']
--- p2 ---
competition ['Bundesliga', '2. Bundesliga', 'DFB-Pokal', 'Champions League', '2. BL North', 'UEFA Cup', 'VL Südwest', '2. BL Nord Aufstiegsr.', 'Ligapokal', "Cup Winners' Cup", 'DFB-SuperCup', 'UI Cup', 'Champions League Qu.', 'Südwestpokal', 'Intertoto-Cup (until 94/95)']
Appearances ['262', '98', '38', '23', '21', '15', '11', '9', '6', '4', '2', '2', '1', '1', '0']
Goals ['62', '18', '9', '7', '3', '4', '1', '2', '2', '2', '-', '2', '-', '1', '-']
Assists ['79', '-', '5', '3', '-', '5', '-', '-', '-', '1', '2', '-', '-', '-', '-']
# ...
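If you prefer tabular output instead of printed lists, each page's dict of lists can also be loaded into a pandas DataFrame. A small sketch, assuming the all_data structure built above (pandas is an extra dependency here):

import pandas as pd

# One DataFrame per page: columns are competition, Appearances, Goals, Assists.
for player in all_data:
    name, data = list(player.items())[0]
    df = pd.DataFrame(data)
    print('---', name, '---')
    print(df)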
I tried a lot of suggestions but I am unable to remove carriage returns. I am new to Python and trying to clean a CSV file.
import csv

filepath_i = 'C:\Source Files\Data Source\Flat File Source\PatientRecords.csv'
filepath_o = 'C:\Source Files\Data Source\Flat File Source\PatientRecords2.csv'

rows = []

with open(filepath_i, 'rU', newline='') as csv_file:
    #filtered = (line.replace('\r\n', '') for line in csv_file)
    filtered = (line.replace('\r', '') for line in csv_file)
    csv_reader = csv.reader(csv_file, delimiter=',')
    i = 0
    for row in csv_reader:
        print(row)
        i = i + 1
        if(i == 10):
            break

#with open(filepath_o, 'w', newline='') as writeFile:
#    writer = csv.writer(writeFile, lineterminator='\r')
#    for row in csv_reader:
#        #rows.append(row.strip())
#        rows.append(row.strip())
#    writer.writerows(rows)
Input
DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region Description,Hospital Category,Hospital Type, Total Discharges ,Covered Charges , Total Payments ,Medicare Payments
039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,AL - Dothan,Specialty Centers,Government Funded,91,"$32,963.07 ","$5,777.24 ","$4,763.73 "
039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10005,MARSHALL MEDICAL CENTER SOUTH,"2505 U S HIGHWAY
431 NORTH",BOAZ,AL,35957,AL - Birmingham,Specialty Centers,Private Institution,14,"$15,131.85 ","$5,787.57 ","$4,976.71 "
039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10006,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,AL - Birmingham,Rehabilitation Centers,Private Institution,24,"$37,560.37 ","$5,434.95 ","$4,453.79 "
Output (4th column 'Provider Street Address')
['DRG Definition', 'Provider Id', 'Provider Name', 'Provider Street Address', 'Provider City', 'Provider State', 'Provider Zip Code', 'Hospital Referral Region Description', 'Hospital Category', 'Hospital Type', ' Total Discharges ', 'Covered Charges ', ' Total Payments ', 'Medicare Payments']
['039 - EXTRACRANIAL PROCEDURES W/O CC/MCC', '10001', 'SOUTHEAST ALABAMA MEDICAL CENTER', '1108 ROSS CLARK CIRCLE', 'DOTHAN', 'AL', '36301', 'AL - Dothan', 'Specialty Centers', 'Government Funded', '91', '$32,963.07 ', '$5,777.24 ', '$4,763.73 ']
['039 - EXTRACRANIAL PROCEDURES W/O CC/MCC', '10005', 'MARSHALL MEDICAL CENTER SOUTH', '2505 U S HIGHWAY \n431 NORTH', 'BOAZ', 'AL', '35957', 'AL - Birmingham', 'Specialty Centers', 'Private Institution', '14', '$15,131.85 ', '$5,787.57 ', '$4,976.71 ']
I ran this on my side and it works:
with open(filepath_i, 'rU', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        row[3] = row[3].replace("\n","").replace("\r","")
        print(row)
Output:
['DRG Definition', 'Provider Id', 'Provider Name', 'Provider Street Address', 'Provider City', 'Provider State', 'Provider Zip Code', 'Hospital Referral Region Description', 'Hospital Category', 'Hospital Type', ' Total Discharges ', 'Covered Charges ', ' Total Payments ', 'Medicare Payments']
['039 - EXTRACRANIAL PROCEDURES W/O CC/MCC', '10001', 'SOUTHEAST ALABAMA MEDICAL CENTER', '1108 ROSS CLARK CIRCLE', 'DOTHAN', 'AL', '36301', 'AL - Dothan', 'Specialty Centers', 'Government Funded', '91', '$32,963.07 ', '$5,777.24 ', '$4,763.73 ']
['039 - EXTRACRANIAL PROCEDURES W/O CC/MCC', '10005', 'MARSHALL MEDICAL CENTER SOUTH', '2505 U S HIGHWAY 431 NORTH', 'BOAZ', 'AL', '35957', 'AL - Birmingham', 'Specialty Centers', 'Private Institution', '14', '$15,131.85 ', '$5,787.57 ', '$4,976.71 ']
['039 - EXTRACRANIAL PROCEDURES W/O CC/MCC', '10006', 'ELIZA COFFEE MEMORIAL HOSPITAL', '205 MARENGO STREET', 'FLORENCE', 'AL', '35631', 'AL - Birmingham', 'Rehabilitation Centers', 'Private Institution', '24', '$37,560.37 ', '$5,434.95 ', '$4,453.79 ']
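If you also want to write the cleaned rows to the second file (as in your commented-out block), a minimal sketch along the same lines could be — assuming the same filepath_i and filepath_o variables from the question, and cleaning every cell rather than only column 3:

import csv

with open(filepath_i, newline='') as csv_file, open(filepath_o, 'w', newline='') as out_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    csv_writer = csv.writer(out_file)
    for row in csv_reader:
        # remove stray newlines/carriage returns from every cell
        cleaned = [cell.replace("\n", "").replace("\r", "") for cell in row]
        csv_writer.writerow(cleaned)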
I have written a script which opens multiple tabs one by one and takes data from each. I am able to get the data from the page, but when writing it to a CSV file I get output like the below.
Bedrooms Bathrooms Super area Floor Status
3 See Dimensions 3 See Dimensions 2100 7 (Out of 23 Floors) 3 See Dimensions
Bedrooms Bathrooms Super area Floor Status
3 See Dimensions 3 See Dimensions 2100 7 (Out of 23 Floors) 3 See Dimensions
Bedrooms Bathrooms Super area Floor Status
1 1 520 4 (Out of 40 Floors) 1
Bedrooms Bathrooms Super area Floor Status
3 See Dimensions 3 See Dimensions 2100 7 (Out of 23 Floors) 3 See Dimensions
Bedrooms Bathrooms Super area Floor Status
1 1 520 4 (Out of 40 Floors) 1
In the Status column I am getting the wrong value. I have tried:
# Go through them and click on each.
for unique_link in my_needed_links:
    unique_link.click()
    time.sleep(2)
    driver.switch_to_window(driver.window_handles[1])

    def get_elements_by_xpath(driver, xpath):
        return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

    search_entries = [
        ("Bedrooms", "//div[@class='seeBedRoomDimen']"),
        ("Bathrooms", "//div[@class='p_value']"),
        ("Super area", "//span[@id='coveredAreaDisplay']"),
        ("Floor", "//div[@class='p_value truncated']"),
        ("Lift", "//div[@class='p_value']")]

    with open('textfile.csv', 'a+') as f_output:
        csv_output = csv.writer(f_output)
        # Write header
        csv_output.writerow([name for name, xpath in search_entries])

        entries = []
        for name, xpath in search_entries:
            entries.append(get_elements_by_xpath(driver, xpath))
        csv_output.writerows(zip(*entries))

    get_elements_by_xpath(driver, xpath)
Edit
Entries: as list
[['3 See Dimensions'], ['3 See Dimensions', '4', '3', '1', '2100 sqft', '1400 sqft', '33%', 'Avenue 54', 'Under Construction', "Dec, '20", 'New Property', '₹ 7.90 Cr ₹ 39,50,000 Approx. Registration Charges ₹ 15 Per sq. Unit Monthly\nSee Other Charges', "Santacruz West, Mumbai., Santacruz West, Mumbai - Western Suburbs, Maharashtra What's Nearby", "Next To St Teresa's Convent School & Sacred Heart School on SV Road.", 'East', 'P51800007149 (The project has been registered via MahaRERA registration number: P51800007149 and is available on the website https://maharera.mahaonline.gov.in under registered projects.)', 'Garden/Park, Pool, Main Road', 'Marble, Marbonite, Wooden', '1 Covered', '24 Hours Available', 'No/Rare Powercut', '6', '6', 'Unfurnished', 'Municipal Corporation of Greater Mumbai', 'Freehold', 'Brokers please do not contact', ''], ['2100'], ['7 (Out of 23 Floors)'], ['3 See Dimensions', '4', '3', '1', '2100 sqft', '1400 sqft', '33%', 'Avenue 54 1 Discussion on forum', 'Under Construction', "Dec, '20", 'New Property', '₹ 7.90 Cr ₹ 39,50,000 Approx. Registration Charges ₹ 15 Per sq. Unit Monthly\nSee Other Charges', "Santacruz West, Mumbai., Santacruz West, Mumbai - Western Suburbs, Maharashtra What's Nearby", "Next To St Teresa's Convent School & Sacred Heart School on SV Road.", 'East', 'P51800007149 (The project has been registered via MahaRERA registration number: P51800007149 and is available on the website https://maharera.mahaonline.gov.in under registered projects.)', 'Garden/Park, Pool, Main Road', 'Marble, Marbonite, Wooden', '1 Covered', '24 Hours Available', 'No/Rare Powercut', '6', '6', 'Unfurnished', 'Municipal Corporation of Greater Mumbai', 'Freehold', 'Brokers please do not contact', '']]
[['3 See Dimensions'], ['3 See Dimensions', '4', '3', '1', '2100 sqft', '1400 sqft', '33%', 'Avenue 54 1 Discussion on forum', 'Under Construction', "Dec, '20", 'New Property', '₹ 7.90 Cr ₹ 39,50,000 Approx. Registration Charges ₹ 15 Per sq. Unit Monthly\nSee Other Charges', "Santacruz West, Mumbai., Santacruz West, Mumbai - Western Suburbs, Maharashtra What's Nearby", "Next To St Teresa's Convent School & Sacred Heart School on SV Road.", 'East', 'P51800007149 (The project has been registered via MahaRERA registration number: P51800007149 and is available on the website https://maharera.mahaonline.gov.in under registered projects.)', 'Garden/Park, Pool, Main Road', 'Marble, Marbonite, Wooden', '1 Covered', '24 Hours Available', 'No/Rare Powercut', '6', '6', 'Unfurnished', 'Municipal Corporation of Greater Mumbai', 'Freehold', 'Brokers please do not contact', ''], ['2100'], ['7 (Out of 23 Floors)'], ['3 See Dimensions', '4', '3', '1', '2100 sqft', '1400 sqft', '33%', 'Avenue 54 1 Discussion on forum', 'Under Construction', "Dec, '20", 'New Property', '₹ 7.90 Cr ₹ 39,50,000 Approx. Registration Charges ₹ 15 Per sq. Unit Monthly\nSee Other Charges', "Santacruz West, Mumbai., Santacruz West, Mumbai - Western Suburbs, Maharashtra What's Nearby", "Next To St Teresa's Convent School & Sacred Heart School on SV Road.", 'East', 'P51800007149 (The project has been registered via MahaRERA registration number: P51800007149 and is available on the website https://maharera.mahaonline.gov.in under registered projects.)', 'Garden/Park, Pool, Main Road', 'Marble, Marbonite, Wooden', '1 Covered', '24 Hours Available', 'No/Rare Powercut', '6', '6', 'Unfurnished', 'Municipal Corporation of Greater Mumbai', 'Freehold', 'Brokers please do not contact', '']]
website link: https://www.magicbricks.com/propertyDetails/1-BHK-520-Sq-ft-Multistorey-Apartment-FOR-Sale-Kandivali-West-in-Mumbai&id=4d423333373433343431
Edit 1
my_needed_links = []

list_links = driver.find_elements_by_tag_name("a")

for i in range(0, 2):
    # Get unique links.
    for link in list_links:
        if "https://www.magicbricks.com/propertyDetails/" in link.get_attribute("href"):
            if link not in my_needed_links:
                my_needed_links.append(link)

    # Go through them and click on each.
    for unique_link in my_needed_links:
        unique_link.click()
        time.sleep(2)
        driver.switch_to_window(driver.window_handles[1])

        def get_elements_by_xpath(driver, xpath):
            return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

        search_entries = [
            ("Bedrooms", "//div[@class='seeBedRoomDimen']"),
            ("Bathrooms", "//div[@class='p_value']"),
            ("Super area", "//span[@id='coveredAreaDisplay']"),
            ("Floor", "//div[@class='p_value truncated']"),
            ("Lift", "//div[@class='p_value']")]

        #with open('textfile.csv', 'a+') as f_output:
        entries = []
        for name, xpath in search_entries:
            entries.append(get_elements_by_xpath(driver, xpath))

        data = [entry for entry in entries if len(entry) == 28]
        df = pd.DataFrame(data)
        print(df)
        df.to_csv('nameoffile.csv', mode='a', index=False, encoding='utf-8')
        #df.to_csv('nameoffile.csv', mode='a', index=False, encoding='utf-8')

        get_elements_by_xpath(driver, xpath)
        time.sleep(2)
        driver.close()
        # Switch back to the main tab/window.
        driver.switch_to_window(driver.window_handles[0])
Thank you in advance. Please suggest something
The xpath for bathrooms and for lift are the same, therefore you get the same results in these columns. Try to find another way to identify and distinguish between them. You can probably use an index, though if there's another way it's usually preferred.
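For example, positional XPath predicates could be used to tell the two apart. This is only a rough sketch; the [1] and [2] indices below are placeholders and need to be checked against the actual order of the p_value blocks on the page:

search_entries = [
    ("Bedrooms", "//div[@class='seeBedRoomDimen']"),
    ("Bathrooms", "(//div[@class='p_value'])[1]"),  # assumed: first p_value block on the page
    ("Super area", "//span[@id='coveredAreaDisplay']"),
    ("Floor", "//div[@class='p_value truncated']"),
    ("Lift", "(//div[@class='p_value'])[2]")]       # assumed: second p_value block on the page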
I could not load the page due to my location. But from your entries, you could do:
# Your selenium imports
import pandas as pd

def get_elements_by_xpath(driver, xpath):
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]

for unique_link in my_needed_links:
    unique_link.click()
    time.sleep(2)
    driver.switch_to_window(driver.window_handles[1])

    search_entries = [
        ("Bedrooms", "//div[@class='seeBedRoomDimen']"),
        ("Bathrooms", "//div[@class='p_value']"),
        ("Super area", "//span[@id='coveredAreaDisplay']"),
        ("Floor", "//div[@class='p_value truncated']"),
        ("Lift", "//div[@class='p_value']")]

    entries = []
    for name, xpath in search_entries:
        entries.append(get_elements_by_xpath(driver, xpath))

    data = [entry for entry in entries if len(entry) > 5]
    df = pd.DataFrame(data)
    df.drop_duplicates(inplace=True)
    df.to_csv('nameoffile.csv', sep=';', index=False, encoding='utf-8', mode='a')

    get_elements_by_xpath(driver, xpath)
I have a list like this:
list = [['Amy Pond', 'R$30 of awesome for R$10', '10.0', '5', '456 Unreal Rd', "Tom's Awesome Shop\n"], ['Marty McFly', 'R$20 Sneakers for R$5', '5.0', '1', '123 Fake St Sneaker', 'Store Emporium\n'], ['Snake Plissken', 'R$20 Sneakers for R$5', '5.0', '4', '123 Fake St Sneaker', 'Store Emporium']]
My model is:
class Vendas(models.Model):
    purchasername = models.CharField(u'Nome do Comprador', max_length=100)
    itemdescription = models.CharField(u'Descrição do Item', max_length=100)
    itemprice = models.IntegerField(u'Preço do Item')
    purchasecount = models.IntegerField(u'Quantidade comprada')
    merchantaddress = models.CharField(u'Endereço do comprador', max_length=100)
    merchantname = models.CharField(u'Nome do Comprador', max_length=100)
I want to load this list as is into the table Vendas.
Can someone help me?
Try this:
for i in list:
    a = Vendas(purchasername=i[0], itemdescription=i[1], itemprice=i[2], purchasecount=i[3], merchantaddress=i[4], merchantname=i[5])
    a.save()
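If the list is large, the rows can also be inserted in a single query with bulk_create. A rough sketch, assuming the same Vendas model and the list from the question (renamed to rows here to avoid shadowing the built-in list):

rows = list  # the list of lists from the question

Vendas.objects.bulk_create([
    Vendas(
        purchasername=i[0],
        itemdescription=i[1],
        itemprice=i[2],
        purchasecount=i[3],
        merchantaddress=i[4],
        merchantname=i[5].strip(),  # strips the trailing '\n' seen in some rows
    )
    for i in rows
])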