Python: combining 2 re.findall result lists into columns and rows in a CSV

#!/usr/bin/env python
import re
import requests
from bs4 import BeautifulSoup
import csv
page = requests.get('https://salesweb.civilview.com/Sales/SalesSearch?countyId=32')
soup = BeautifulSoup(page.text, 'html.parser')
list_ = soup.find(class_='table-striped')
list_items = list_.find_all('tr')
content = list_items
d = re.findall(r"<td>\d*/\d*/\d+</td>",str(content))
#d = re.findall(r"<td>\d*/\d*/\d+</td>|<td>\d*?\s.+\d+</td>",str(content))
a = re.findall(r"<td>\d*?\s.+\d+?.*</td>",str(content))
res = d+a
for tup in res:
    tup = re.sub("<td>",'',str(tup))
    tup = re.sub("</td>",'',str(tup))
    print(tup)
I'm getting the sale dates and then the addresses when just printing to the screen. I have tried several ways to get this into a CSV, but I end up with all the data in one column or one row. I would like just two columns, sale dates and addresses, with all the returned rows.
This is what I get just using print():
8/25/2021
9/1/2021
9/1/2021
9/1/2021
9/1/2021
9/1/2021
9/8/2021
9/8/2021
9/8/2021
9/8/2021
9/15/2021
9/15/2021
9/15/2021
9/15/2021
9/15/2021
9/15/2021
9/22/2021
9/29/2021
9/29/2021
9/29/2021
11/17/2021
4/30/3021
40 PAVILICA ROAD STOCKTON NJ 08559
129 KINGWOOD LOCKTOWN ROAD FRENCHTOWN NJ 08825
63 PHLOX COURT WHITEHOUSE STATION NJ 08889
41 WESTCHESTER TERRACE UNIT 11 CLINTON NJ 08809
461 LITTLE YORK MOUNT PLEASANT ROAD MILFORD NJ 08848
9 MAPLE AVENUE FRENCHTOWN NJ 08825
95 BARTON HOLLOW ROAD FLEMINGTON NJ 08822
27 WORMAN ROAD STOCKTON NJ 08559
30 COLD SPRINGS ROAD CALIFON NJ 07830
211 OLD CROTON ROAD FLEMINGTON NJ 08822
3 BRIAR LANE FLEMINGTON NJ 08822(VACANT)
61 N. FRANKLIN STREET LAMBERTVILLE NJ 08530
802 SPRUCE HILLS DRIVE GLEN GARDNER NJ 08826
2155 STATE ROUTE 31 GLEN GARDNER NJ 08826
80 SCHAAF ROAD BLOOMSBURY NJ 08804
9 CAMBRIDGE DRIVE MILFORD NJ 08848
5 VAN FLEET ROAD NESHANIC STATION NJ 08853
34 WASHINGTON STREET ANNANDALE NJ 08801
229 MILFORD MT PLEASANT ROAD MILFORD NJ 08848
1608 COUNTY ROAD 519 FRENCHTOWN NJ 08825
29 OLD SCHOOLHOUSE ROAD ASBURY NJ 08802
28 ROSE RUN LAMBERTVILLE NJ 08530
Any help would be great. I have been playing with this all day and can't seem to get it right no matter what I try.

My two cents:
#!/usr/bin/env python
import re
import requests
from bs4 import BeautifulSoup
import csv
separator = ','
page = requests.get('https://salesweb.civilview.com/Sales/SalesSearch?countyId=32')
soup = BeautifulSoup(page.text, 'html.parser')
list_ = soup.find(class_='table-striped')
list_items = list_.find_all('tr')
content = list_items
d = re.findall(r"<td>\d*/\d*/\d+</td>",str(content))
a = re.findall(r"<td>\d*?\s.+\d+?.*</td>",str(content))
for date, address in zip(d, a):
    print(re.sub("</td>|<td>",'',str(date)),
          separator,
          re.sub("</td>|<td>",'',str(address)))
Output: date and address are now on one row:
8/25/2021 , 40 PAVILICA ROAD STOCKTON NJ 08559
9/1/2021 , 129 KINGWOOD LOCKTOWN ROAD FRENCHTOWN NJ 08825
9/1/2021 , 63 PHLOX COURT WHITEHOUSE STATION NJ 08889
9/1/2021 , 41 WESTCHESTER TERRACE UNIT 11 CLINTON NJ 08809
9/1/2021 , 461 LITTLE YORK MOUNT PLEASANT ROAD MILFORD NJ 08848
9/1/2021 , 9 MAPLE AVENUE FRENCHTOWN NJ 08825
9/8/2021 , 95 BARTON HOLLOW ROAD FLEMINGTON NJ 08822
9/8/2021 , 27 WORMAN ROAD STOCKTON NJ 08559
9/8/2021 , 30 COLD SPRINGS ROAD CALIFON NJ 07830
9/8/2021 , 211 OLD CROTON ROAD FLEMINGTON NJ 08822
9/15/2021 , 3 BRIAR LANE FLEMINGTON NJ 08822(VACANT)
9/15/2021 , 61 N. FRANKLIN STREET LAMBERTVILLE NJ 08530
9/15/2021 , 802 SPRUCE HILLS DRIVE GLEN GARDNER NJ 08826
9/15/2021 , 2155 STATE ROUTE 31 GLEN GARDNER NJ 08826
9/15/2021 , 80 SCHAAF ROAD BLOOMSBURY NJ 08804
9/15/2021 , 9 CAMBRIDGE DRIVE MILFORD NJ 08848
9/22/2021 , 5 VAN FLEET ROAD NESHANIC STATION NJ 08853
9/29/2021 , 34 WASHINGTON STREET ANNANDALE NJ 08801
9/29/2021 , 229 MILFORD MT PLEASANT ROAD MILFORD NJ 08848
9/29/2021 , 1608 COUNTY ROAD 519 FRENCHTOWN NJ 08825
11/17/2021 , 29 OLD SCHOOLHOUSE ROAD ASBURY NJ 08802
4/30/3021 , 28 ROSE RUN LAMBERTVILLE NJ 08530
Extra: to export to CSV using pandas:
import pandas as pd
date_list = []
address_list = []
for date, address in zip(d, a):
    date_list.append(re.sub("</td>|<td>",'',str(date)))
    address_list.append(re.sub("</td>|<td>",'',str(address)))
df = pd.DataFrame([date_list, address_list]).T
df.columns = ['Date', 'Address']
df.to_csv('data.csv')
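Note that by default to_csv also writes the dataframe index as an unnamed first column; if you want only the two data columns in the file, pass index=False, i.e. df.to_csv('data.csv', index=False).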

It seems to me that instead of using two regular expressions you should rather use one with named groups. I leave it to you to try.
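For what it's worth, a minimal sketch of that idea, assuming each row's date cell comes somewhere before its address cell in the HTML (I haven't verified the cell order against the live page):
import re
# One pattern, two named groups; DOTALL lets .*? skip any cells that
# sit between the date and the address (an assumption about the markup).
row_re = re.compile(
    r"<td>(?P<date>\d{1,2}/\d{1,2}/\d{4})</td>"
    r".*?"
    r"<td>(?P<address>\d[^<]*)</td>",
    re.DOTALL)
for m in row_re.finditer(str(content)):
    print(m.group("date"), m.group("address"))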
Given that you have two corresponding lists of values, the simplest approach is, instead of concatenating them:
res = d+a
to just iterate over them in pairs:
for tup, tup2 in zip(d, a):
    tup = re.sub("<td>",'',str(tup))
    tup = re.sub("</td>",'',str(tup))
    tup2 = re.sub("<td>",'',str(tup2))
    tup2 = re.sub("</td>",'',str(tup2))
    print(tup, tup2)
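Since the original script already imports csv (but never uses it), the same zip pairing can also write straight to two columns with the standard library; a sketch reusing the d and a lists from above:
import csv
with open('sales.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Sale Date', 'Address'])  # header row
    for date, address in zip(d, a):
        # strip the <td> tags, then write one two-column row per pair
        writer.writerow([re.sub("</td>|<td>", '', str(date)),
                         re.sub("</td>|<td>", '', str(address))])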

#!/usr/bin/env python
import re
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
page = requests.get('https://salesweb.civilview.com/Sales/SalesSearch?countyId=32')
soup = BeautifulSoup(page.text, 'html.parser')
list_ = soup.find(class_='table-striped')
list_items = list_.find_all('tr')
content = list_items
d = re.findall(r"<td>\d*/\d*/\d+</td>",str(content)) #this is a list
#d = re.findall(r"<td>\d*/\d*/\d+</td>|<td>\d*?\s.+\d+</td>",str(content))
a = re.findall(r"<td>\d*?\s.+\d+?.*</td>",str(content)) #this is a list
## create a dataframe with two lists and remove tags
df = pd.DataFrame(list(zip(d,a)), columns=['sales_date','address'])
for cols in df.columns:
    # str.lstrip/rstrip strip character sets, not substrings, so replace the tags instead
    df[cols] = df[cols].map(lambda x: x.replace('<td>','').replace('</td>',''))
df.to_csv("result.csv")

Related

beautifulsoup find text between span

I want to get just the text from the span in this anchor:
<a class="business-name" data-analytics='{"click_id":1600,"target":"name","feature_click":""}' href="/new-york-ny/bpp/upper-eastside-orthodontists-20151" rel=""><span>Upper Eastside Orthodontists</span></a>
My code:
name = soup.find('a', {'class': 'business-name'})
print(name.find('span').text)
gives me:
AttributeError: 'NoneType' object has no attribute 'text'
I want to get just the text: Upper Eastside Orthodontists
What you are actually looking for is not in the static/initial request. The page is rendered dynamically.
Luckily the data does come in under the <script> tags, and you can pull out the json and parse it from there:
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
url = 'https://www.superpages.com/new-york-ny/dentists?page=1'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
script = soup.find_all('script', {'type':"application/ld+json"})[-2]
p = re.compile('({.*})')
result = p.search(str(script))
data = json.loads(result.group(0))
df = pd.DataFrame(data['mainEntity']['itemListElement'])
Output:
print(df.to_string())
#type name url
0 ItemPage Upper Eastside Orthodontists https://www.superpages.com/new-york-ny/bpp/upper-eastside-orthodontists-20151
1 ItemPage Kara https://www.superpages.com/new-york-ny/bpp/kara-5721648
2 ItemPage Central Park West Dentistry https://www.superpages.com/new-york-ny/bpp/central-park-west-dentistry-471054528
3 ItemPage Majid Rajabi Khamesi Advanced Family Dental https://www.superpages.com/new-york-ny/bpp/majid-rajabi-khamesi-advanced-family-dental-542761105
4 ItemPage Robert Veligdan, DMD, PC https://www.superpages.com/new-york-ny/bpp/robert-veligdan-dmd-pc-21238912
5 ItemPage Irina Rossinski, DDS https://www.superpages.com/new-york-ny/bpp/irina-rossinski-dds-462447740
6 ItemPage Dr. Michael J. Wei https://www.superpages.com/new-york-ny/bpp/dr-michael-j-wei-504012551
7 ItemPage Manhattan Dental Spa https://www.superpages.com/new-york-ny/bpp/manhattan-dental-spa-22612348
8 ItemPage Expert Dental PC https://www.superpages.com/new-york-ny/bpp/expert-dental-pc-459327373
9 ItemPage Dr. Jonathan Freed, D.D.S., P.C. https://www.superpages.com/new-york-ny/bpp/dr-jonathan-freed-d-d-s-p-c-503142997
10 ItemPage Clifford S. Melnick, DMD PC https://www.superpages.com/new-york-ny/bpp/clifford-s-melnick-dmd-pc-512698216
11 ItemPage Ronald Birnbaum Dds https://www.superpages.com/new-york-ny/bpp/ronald-birnbaum-dds-2757412
12 ItemPage Concerned Dental Care https://www.superpages.com/new-york-ny/bpp/concerned-dental-care-453434343
13 ItemPage DownTown Dental Cosmetic Center https://www.superpages.com/new-york-ny/bpp/downtown-dental-cosmetic-center-468569119
14 ItemPage Beth Caunitz, D.D.S. https://www.superpages.com/new-york-ny/bpp/beth-caunitz-d-d-s-479935675
15 ItemPage Alice Urbankova DDS, P https://www.superpages.com/new-york-ny/bpp/alice-urbankova-dds-p-474879958
16 ItemPage Wu Darryl DDS PC https://www.superpages.com/new-york-ny/bpp/wu-darryl-dds-pc-8291524
17 ItemPage Gerald Rosen DDS https://www.superpages.com/new-york-ny/bpp/gerald-rosen-dds-470302208
18 ItemPage Group Health Dental https://www.superpages.com/new-york-ny/bpp/group-health-dental-15648711
19 ItemPage Dr. Shaun Massiah, DMD https://www.superpages.com/new-york-ny/bpp/dr-shaun-massiah-dmd-453290181
20 ItemPage Park 56 Dental https://www.superpages.com/new-york-ny/bpp/park-56-dental-479624928?lid=1001970746762
21 ItemPage Rubin Esther S https://www.superpages.com/new-york-ny/bpp/rubin-esther-s-462458952
22 ItemPage David P Pitman DMD https://www.superpages.com/new-york-ny/bpp/david-p-pitman-dmd-9139813
23 ItemPage Daniell Jason Mishaan, DMD https://www.superpages.com/new-york-ny/bpp/daniell-jason-mishaan-dmd-479623764
24 ItemPage Dolman Oral Surgery https://www.superpages.com/new-york-ny/bpp/dolman-oral-surgery-534333982
25 ItemPage Emagen Dental https://www.superpages.com/new-york-ny/bpp/emagen-dental-460512214
26 ItemPage The Exchange Dental Group https://www.superpages.com/new-york-ny/bpp/the-exchange-dental-group-462981940
27 ItemPage Joshua M. Wilges DDS & Associates https://www.superpages.com/new-york-ny/bpp/joshua-m-wilges-dds-associates-497873451
28 ItemPage Oren Rahmanan, DDS https://www.superpages.com/new-york-ny/bpp/oren-rahmanan-dds-472633138
29 ItemPage Victoria Veytsman, DDS https://www.superpages.com/new-york-ny/bpp/victoria-veytsman-dds-456826960
You could then iterate through each link to get the data from its page.
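For example (a sketch only; the selectors you'd need on each detail page are not verified here):
for link in df['url']:
    detail = requests.get(link)
    detail_soup = BeautifulSoup(detail.text, 'html.parser')
    # pull whatever fields you need out of detail_soup here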
The other option, which is a little tricky, is that I did find the data within the HTML itself. It's only tricky in that you need to cut out the excess (there's the sponsor ad, and then more entries after the initial 30 results that don't follow the same HTML structure/pattern).
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
url = 'https://www.superpages.com/new-york-ny/dentists?page=1'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
businesses = soup.find_all('a', {'class':'business-name'})
rows = []
for each in businesses[1:31]:
    name = each.text
    address = each.find_next('div', {'class':'street-address'}).text
    phone = each.find_next('a', {'class':'phones phone primary'}).text.replace('Call Now','')
    row = {'name':name,
           'address':address,
           'phone':phone}
    rows.append(row)
df = pd.DataFrame(rows)
Output:
print(df.to_string())
name address phone
0 Upper Eastside Orthodontists 153 E 87th St Apt 1b, New York, NY, 10128 888-378-2976
1 Kara 30 E 60th St Rm 503, New York, NY, 10022 212-355-2195
2 Central Park West Dentistry 25 W 68th St, New York, NY, 10023 212-579-8885
3 Majid Rajabi Khamesi Advanced Family Dental 30 E 40th St Rm 705, New York, NY, 10016 212-481-2535
4 Robert Veligdan, DMD, PC 343 W 58th St, New York, NY, 10019 212-832-2330
5 Irina Rossinski, DDS 30 5th Ave Apt 1g, New York, NY, 10011 212-673-3700
6 Dr. Michael J. Wei 425 Madison Ave.20th Floor, New York, NY, 10017 646-798-6490
7 Manhattan Dental Spa 200 Madison Ave Ste 2201, New York, NY, 10016 212-683-2530
8 Expert Dental PC 110 E 40th St Rm 104, New York, NY, 10016 212-682-2965
9 Dr. Jonathan Freed, D.D.S., P.C. 315 Madison Ave Rm 509, New York, NY, 10017 212-682-5644
10 Clifford S. Melnick, DMD PC 41 W 58th St Apt 2e, New York, NY, 10019 212-355-1266
11 Ronald Birnbaum Dds 425 W 59th St, New York, NY, 10019 212-523-8030
12 Concerned Dental Care 30 E 40th St Rm 207, New York, NY, 10016 212-696-4979
13 DownTown Dental Cosmetic Center 160 Broadway, New York, NY, 10038 212-964-3337
14 Beth Caunitz, D.D.S. 30 East 40th Street, Suite 406, New York, NY, 10016 212-206-9002
15 Alice Urbankova DDS, P 630 5th Ave Ste 1860, New York, NY, 10111 212-765-7340
16 Wu Darryl DDS PC 41 Elizabeth St, New York, NY, 10013 212-925-7757
17 Gerald Rosen DDS 59 E 54th St, New York, NY, 10022 212-753-9860
18 Group Health Dental 230 W 41st St, New York, NY, 10036 212-398-9690
19 Dr. Shaun Massiah, DMD 50 W 97th St Apt 1c, New York, NY, 10025 212-222-5225
20 Park 56 Dental 120 E 56th St Rm 610, New York, NY, 10022 347-770-3915
21 Rubin Esther S 18 E 48th St, New York, NY, 10017 212-593-7272
22 David P Pitman DMD 57 W 57th St Ste 707, New York, NY, 10019 212-888-2833
23 Daniell Jason Mishaan, DMD 241 W 37th St, New York, NY, 10018 212-730-4440
24 Dolman Oral Surgery 16 E 52nd St Ste 402, New York, NY, 10022 212-696-0167
25 Emagen Dental 250 8th Ave Apt 2s, New York, NY, 10011 212-352-9300
26 The Exchange Dental Group 39 Broadway Rm 2115, New York, NY, 10006 212-422-9229
27 Joshua M. Wilges DDS & Associates 2 West 45th Street Suite 1708, New York, NY, 10036 646-590-2100
28 Oren Rahmanan, DDS 1 Rockefeller Plz Rm 2223, New York, NY, 10020 212-581-6736
29 Victoria Veytsman, DDS 509 Madison Ave Rm 1704, New York, NY, 10022 212-759-6700

Beautiful soup scraping with selenium

I'm learning how to scrape using Beautiful Soup with Selenium, and I found a website that has multiple tables and table tags (my first time dealing with them). I'm trying to scrape the text from each table and append each element to its respective list. First I'm trying to scrape the first table; the rest I want to do on my own. But I cannot access the tag for some reason.
I also incorporated Selenium to access the site, because when I copy the link into another tab, the list of tables disappears, for some reason.
My code so far:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.support.ui import Select
PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
targetSite = "https://www.sdvisualarts.net/sdvan_new/events.php"
driver.get(targetSite)
select_event = Select(driver.find_element_by_name('subs'))
select_event.select_by_value('All')
select_loc = Select(driver.find_element_by_name('loc'))
select_loc.select_by_value("All")
driver.find_element_by_name("submit").click()
targetSite = "https://www.sdvisualarts.net/sdvan_new/viewevents.php"
event_title = []
name = []
address = []
city = []
state = []
zipCode = []
location = []
webSite = []
fee = []
event_dates = []
opening_dates = []
description = []
try:
    page = requests.get(targetSite)
    soup = BeautifulSoup(page.text, 'html.parser')
    items = soup.find_all('table', {"class":"popdetail"})
    for i in items:
        event_title.append(item.find('b', {'class': "text"})).text.strip()
        name.append(item.find('td', {'class': "text"})).text.strip()
        address.append(item.find('td', {'class': "text"})).text.strip()
        city.append(item.find('td', {'class': "text"})).text.strip()
        state.append(item.find('td', {'class': "text"})).text.strip()
        zipCode.append(item.find('td', {'class': "text"})).text.strip()
Can someone let me know if I am doing this correctly? This is my first time dealing with a site whose page elements disappear when its URL is copied into a new tab and/or window.
So far, I am unable to append any information to the lists.
One issue is with the for loop: you have for i in items:, but then you refer to item instead of i.
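There's also a second, subtler problem: the .text.strip() calls sit outside the closing parenthesis of append(), so they are applied to append()'s return value (None) rather than to the found tag. The corrected loop would look something like this (assuming those tags actually exist in every table; a missing tag would still raise AttributeError):
for i in items:
    # call .text.strip() on the tag, inside append()
    event_title.append(i.find('b', {'class': 'text'}).text.strip())
    name.append(i.find('td', {'class': 'text'}).text.strip())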
And secondly, if you are using Selenium to render the page, then you should probably use Selenium to get the HTML as well. The site also has tables embedded within tables, so it's not as straightforward as iterating through the <table> tags. What I ended up doing was having pandas read in the tables (it returns a list of dataframes), then iterating through those, as there is a pattern to how the dataframes are constructed.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import Select
PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
targetSite = "https://www.sdvisualarts.net/sdvan_new/events.php"
driver.get(targetSite)
select_event = Select(driver.find_element_by_name('subs'))
select_event.select_by_value('All')
select_loc = Select(driver.find_element_by_name('loc'))
select_loc.select_by_value("All")
driver.find_element_by_name("submit").click()
targetSite = "https://www.sdvisualarts.net/sdvan_new/viewevents.php"
event_title = []
name = []
address = []
city = []
state = []
zipCode = []
location = []
webSite = []
fee = []
event_dates = []
opening_dates = []
description = []
dfs = pd.read_html(driver.page_source)
driver.close()
for idx, table in enumerate(dfs):
    if table.iloc[0,0] == 'Event Title':
        event_title.append(table.iloc[-1,0])
        tempA = dfs[idx+1]
        tempA.index = tempA[0]
        tempB = dfs[idx+4]
        tempB.index = tempB[0]
        tempC = dfs[idx+5]
        tempC.index = tempC[0]
        name.append(tempA.loc['Name',1])
        address.append(tempA.loc['Address',1])
        city.append(tempA.loc['City',1])
        state.append(tempA.loc['State',1])
        zipCode.append(tempA.loc['Zip',1])
        location.append(tempA.loc['Location',1])
        webSite.append(tempA.loc['Web Site',1])
        fee.append(tempB.loc['Fee',1])
        event_dates.append(tempB.loc['Dates',1])
        opening_dates.append(tempB.loc['Opening Days',1])
        description.append(tempC.loc['Event Description',1])
df = pd.DataFrame({'event_title':event_title,
                   'name':name,
                   'address':address,
                   'city':city,
                   'state':state,
                   'zipCode':zipCode,
                   'location':location,
                   'webSite':webSite,
                   'fee':fee,
                   'event_dates':event_dates,
                   'opening_dates':opening_dates,
                   'description':description})
Output:
print (df.to_string())
event_title name address city state zipCode location webSite fee event_dates opening_dates description
0 The San Diego Museum of Art Welcomes a Special... San Diego Museum of Art 1450 El Prado, Balboa Park San Diego CA 92101 Central San Diego https://www.sdmart.org/ NaN Starts On 6-18-2020 Ends On 1-10-2021 Opens virtually on June 18. The work will beco... The San Diego Museum of Art is launching its f...
1 New Exhibit: Miller Dairy Remembered Lemon Grove Historical Society 3185 Olive Street, Treganza Heritage Park Lemon Grove CA 91945 Central San Diego http://www.lghistorical.org Children 12 and under free and must be accompa... Starts On 6-27-2020 Ends On 12-4-2020 Exhibit on view Saturdays 11 am to 2 pm; close... From 1926 there were cows smack in the midst o...
2 Gizmos and Shivelight Distinction Gallery 317 E. Grand Ave Escondido CA 92025 North County Inland http://www.distinctionart.com NaN Starts On 7-14-2020 Ends On 9-5-2020 08/08/20 - 09/05/20 Distinction Gallery is proud to present our so...
3 Virtual Opening - July Exhibitions Vision Art Museum 2825 Dewey Rd. Suite 100 San Diego CA 92106 Central San Diego http://www.visionsartmuseum.org Free Starts On 7-18-2020 Ends On 10-4-2020 NaN Join Visions Art Museum for a virtual exhibiti...
4 Laying it Bare: The Art of Walter Redondo and ... Fresh Paint Gallery 1020-B Prospect Street La Jolla CA 92037 Central San Diego http://freshpaintgallery.com/ NaN Starts On 8-1-2020 Ends On 9-27-2020 Tuesday through Sunday. Mondays closed. A two-person exhibit of new abstract expressio...
5 Online oil painting lessons with Concetta Antico NaN NaN NaN NaN NaN Virtual http://concettaantico.com/live-online-oil-pain... NaN Starts On 8-10-2020 Ends On 8-31-2020 NaN Anyone can learn to paint like the masters! Ov...
6 MOMENTUM: A Creative Industry Symposium Vanguard Culture Via Zoom San Diego California 92101 Virtual https://www.eventbrite.com/e/momentum-a-creati... $10 suggested donation Starts On 8-17-2020 Ends On 9-7-2020 NaN MOMENTUM: A Creative Industry Symposium Monday...
7 Virtual Locals Invitational Show Art & Frames of Coronado 936 ORANGE AVE Coronado CA 92118 0 https://www.artsteps.com/view/5eed0ad62cd0d65b... free Starts On 8-21-2020 Ends On 8-1-2021 NaN Art and Frames of Coronado invites you to our ...
8 HERE & Now R.B. Stevenson Gallery 7661 Girard Avenue, Suite 101 La Jolla California 92037 Central San Diego http://www.rbstevensongallery.com Free Starts On 8-22-2020 Ends On 9-25-2020 Tuesday through Saturday R.B.Stevenson Gallery is pleased to announce t...
9 Art Unites Learning: Normal 2.0 Art Unites NaN San Diego NaN 92116 Central San Diego https://www.facebook.com/events/956878098104971 Free Starts On 8-25-2020 Ends On 8-25-2020 NaN Please join us on Tuesday, August 25th as we: ...
10 Image Quest Sojourn; Visual Journaling for Per... Pamela Underwood Studios Virtual NaN NaN NaN Virtual http://www.pamelaunderwood.com/event/new-onlin... $595.00 Starts On 8-26-2020 Ends On 11-11-2020 NaN Create a personal Image Quest resource journal...
11 Behind The Exhibition: Southern California Con... Oceanside Museum of Art 704 Pier View Way Oceanside California 92054 Virtual https://oma-online.org/events/behind-the-exhib... No fee required. Donations recommended. Starts On 8-27-2020 Ends On 8-27-2020 NaN Join curator Beth Smith and exhibitions manage...
12 Lay it on Thick, a Virtual Art Exhibition San Diego Watercolor Society 2825 Dewey Rd Bldg #202 San Diego California 92106 0 https://www.sdws.org NaN Starts On 8-30-2020 Ends On 9-26-2020 NaN The San Diego Watercolor Society proudly prese...
13 The Forum: Marketing & Branding for Creatives Vanguard Culture Via Zoom San Diego CA 92101 South San Diego http://vanguardculture.com/ $5 suggested donation Starts On 9-1-2020 Ends On 9-1-2020 NaN Attention creative industry professionals! Joi...
14 Write or Die Solo Exhibition You Belong Here 3619 EL CAJON BLVD San Diego CA 92104 Central San Diego http://www.youbelongsd.com/upcoming-events/wri... $10 donation to benefit You Belong Here Starts On 9-4-2020 Ends On 9-6-2020 NaN Write or Die is an immersive installation and ...
15 SDVAN presents Art San Diego at Bread and Salt San Diego Visual Arts Network 1955 Julian Avenue San Digo CA 92113 Central San Diego http://www.sdvisualarts.net and https://www.br... Free Starts On 9-5-2020 Ends On 10-24-2020 NaN We are pleased to announce the four artist rec...
16 The Coming of Treganza Heritage Park Lemon Grove Historical Society 3185 Olive Street Lemon Grove CA 91945 Central San Diego http://www.lghistorical.org Free for all ages Starts On 9-10-2020 Ends On 9-10-2020 The park is open daily, 8 am to 8 pm. Covid 19... Lemon Grove\'s central city park will be renam...
17 Online oil painting course | 4 weeks NaN NaN NaN NaN NaN Virtual http://concettaantico.com/live-online-oil-pain... NaN Starts On 9-14-2020 Ends On 10-5-2020 NaN Over 4 weekly Zoom lessons, learn the techniqu...
18 Online oil painting course | 4 weeks NaN NaN NaN NaN NaN Virtual http://concettaantico.com/live-online-oil-pain... NaN Starts On 10-12-2020 Ends On 11-2-2020 NaN Over 4 weekly Zoom lessons, learn the techniqu...
19 36th Annual Mission Fed ArtWalk Mission Fed ArtWalk Ash Street San Diego California 92101 Central San Diego www.missionfedartwalk.org Free Starts On 11-7-2020 Ends On 11-8-2020 Sat and Sun Nov 7 and 8 Mission Fed ArtWalk returns to San Diego’s Lit...
20 Mingei Pop Up Workshop: My Daruma Doll New Childrens Museum 200 West Island Avenue San Diego California 92101 Central San Diego http://thinkplaycreate.org/ Free with admission Starts On 11-13-2020 Ends On 11-13-2020 NaN Join Mingei International Museum at The New Ch...

Problem concatenating URL and scraping data

I am trying to append a URL in python to scrape details from the target URL.
I have the code below, but it seems to be scraping the data from url1 rather than URL.
I have scraped the team names from the NFL website without any issue. The issue is with the spotrac URL, where I am appending the team name scraped from the NFL site.
import requests
from bs4 import BeautifulSoup
import pandas as pd
URL ='https://www.nfl.com/teams/'
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')
team_name = []
team_name_list = soup.find_all('h4',class_='d3-o-media-object__roofline nfl-c-custom-promo__headline')
for team in team_name_list:
    if team.find('p'):
        team_name.append(team.text)
for team in team_name:
    team = team.replace(" ", "-").lower()
    url1 = 'https://www.spotrac.com/nfl/rankings/'
    URL = url1 + str(team)
    print(URL)
    data = {
        'ajax': 'true',
        'mobile': 'false'
    }
    bs_soup = BeautifulSoup(requests.post(URL, data=data).content, 'html.parser')
    spotrac_df = pd.DataFrame(columns = ['Name', 'Salary'])
    for h3 in bs_soup.select('h3'):
        spotrac_df = spotrac_df.append(pd.DataFrame({'Name': str(h3.text), 'Salary' : str(h3.find_next(class_="rank-value").text)}, index=[0]), ignore_index=False)
I'm almost certain the problem comes from the URL not being appended properly. The scraping is taking the salaries etc. from url1 rather than URL.
My console output (using the Spyder IDE) from print(URL) is as below:
The URL is appending correctly, but you have leading white space in your team names. I also made a few other changes and noted them in the code.
Lastly (and I used to do this too), creating an empty dataframe and then appending to it on each iteration isn't the best method. It's better to collect your rows as lists/dictionaries and then, when done, call on pandas to construct the dataframe, so I changed that as well.
import requests
from bs4 import BeautifulSoup
import pandas as pd
url ='https://www.nfl.com/teams/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
team_name = []
team_name_list = soup.find_all('h4',class_='d3-o-media-object__roofline nfl-c-custom-promo__headline')
for team in team_name_list:
    if team.find('p'):
        team_name.append(team.text.strip()) #<- remove leading/trailing white space
url1 = 'https://www.spotrac.com/nfl/rankings/' #<- since this is fixed, put it before the loop
spotrac_rows = []
for team in team_name:
    team = '-'.join(team.split()).lower() #<- changed to split in case there's 2 spaces between city and team
    url = url1 + str(team)
    print(url)
    data = {
        'ajax': 'true',
        'mobile': 'false'
    }
    bs_soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
    for h3 in bs_soup.select('h3'):
        spotrac_rows.append({'Name': str(h3.text), 'Salary' : str(h3.find_next(class_="rank-value").text.strip())}) #<- remove white space from the salary
spotrac_df = pd.DataFrame(spotrac_rows)
Output:
print(spotrac_df)
Name Salary
0 Chandler Jones $21,333,333
1 Patrick Peterson $13,184,588
2 D.J. Humphries $12,800,000
3 DeAndre Hopkins $12,500,000
4 Larry Fitzgerald $11,750,000
5 Jordan Hicks $10,500,000
6 Justin Pugh $10,500,000
7 Kenyan Drake $8,483,000
8 Kyler Murray $8,080,601
9 Robert Alford $7,500,000
10 J.R. Sweezy $6,500,000
11 Corey Peters $4,437,500
12 Haason Reddick $4,288,444
13 Jordan Phillips $4,000,000
14 Isaiah Simmons $3,757,101
15 Maxx Williams $3,400,000
16 Zane Gonzalez $3,259,000
17 Devon Kennard $2,500,000
18 Budda Baker $2,173,184
19 De'Vondre Campbell $2,000,000
20 Andy Lee $2,000,000
21 Byron Murphy $1,815,795
22 Christian Kirk $1,607,691
23 Aaron Brewer $1,168,750
24 Max Garcia $1,143,125
25 Andy Isabella $1,052,244
26 Mason Cole $977,629
27 Zach Allen $975,855
28 Chris Banjo $887,500
29 Jonathan Bullard $887,500
... ...
2530 Khari Blasingame $675,000
2531 Kenneth Durden $675,000
2532 Cody Hollister $675,000
2533 Joey Ivie $675,000
2534 Greg Joseph $675,000
2535 Kareem Orr $675,000
2536 David Quessenberry $675,000
2537 Derick Roberson $675,000
2538 Shaun Wilson $675,000
2539 Cole McDonald $635,421
2540 Chris Jackson $629,570
2541 Kobe Smith $614,333
2542 Aaron Brewer $613,333
2543 Cale Garrett $613,333
2544 Tommy Hudson $613,333
2545 Kristian Wilkerson $613,333
2546 Khaylan Kearse-Thomas $612,500
2547 Nick Westbrook $612,333
2548 Kyle Williams $611,833
2549 Mason Kinsey $611,666
2550 Tucker McCann $611,666
2551 Cameron Scarlett $611,666
2552 Teair Tart $611,666
2553 Brandon Kemp $611,333
2554 Wyatt Ray $610,000
2555 Josh Smith $610,000
2556 Logan Woodside $610,000
2557 Rashard Davis $610,000
2558 Avery Gennesy $610,000
2559 Parker Hesse $610,000
[2560 rows x 2 columns]

How to find coordinates from a list of addresses in a dataframe

I am trying to create 2 columns in my dataframe, Longitude and Latitude, which I want to find by using my address column called 'Details'.
I have tried:
from geopy.extra.rate_limiter import RateLimiter
locator = Nominatim(user_agent="MyGeocoder")
results['location'] = results['Details'].apply
results['point'] = results['location'].apply(lambda loc: tuple(loc['point']) if loc else None)
results[['latitude', 'longitude']] = pd.DataFrame(results['point'].tolist(), index=results.index)
But this gives the error "method object is not subscriptable".
I want to create a loop to get the coordinates for each address:
Details Sale Price Post Code Year Sold
1 53 Eastbury Grove, London, W4 2JT Flat, Lease... 450000.0 W4 2020
2 Flat 148 Wedgwood House Lambeth Walk, London, ... 325000.0 E11 2020
3 63 Russell Road, Wimbledon, London, SW19 1QN ... 800000.0 W19 2020
4 Flat 2 9 Queens Gate Place, London, SW7 5NX F... 400000.0 W7 2020
5 83 Chingford Mount Road, London, E4 8LU Freeh... 182000.0 E4 2020
... ... ... ... ...
47 702 Rutherford Heights Rodney Road, London, SE... 554750.0 E17 2015
48 Flat 48 Highlands Court Highland Road, London,... 340000.0 E19 2015
49 5 Mount Nod Road, London, SW16 2LQ Flat, Leas... 395000.0 W16 2015
50 6 Woodmill Street, London, SE16 3GG Terraced,... 1010000.0 E16 2015
51 402 Rutherford Heights Rodney Road, London, SE... 403200.0 E17 2015
300 rows × 4 columns
Try this:
import pandas as pd
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
def lat_long(row):
    loc = locator.geocode(row["Details"])
    row["latitude"] = loc.latitude
    row["longitude"] = loc.longitude
    return row
results = results.apply(lat_long, axis=1)
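For what it's worth, the original error comes from results['Details'].apply on the third line of the question: apply is never actually called with a function argument, so the next line tries to subscript a bound method. Also note the snippet above builds a RateLimiter-wrapped geocode but then calls locator.geocode directly inside lat_long; a sketch that uses the wrapper and guards against addresses Nominatim can't resolve (where geocoding returns None):
def lat_long(row):
    loc = geocode(row["Details"])  # the rate-limited wrapper defined above
    row["latitude"] = loc.latitude if loc else None
    row["longitude"] = loc.longitude if loc else None
    return row
results = results.apply(lat_long, axis=1)  # apply returns a new frame; keep it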

How to get all data using beautifulsoup?

I am trying to scrape all the addresses for "Recent Sales" on this page:
https://www.compass.com/agents/irene-vuong/
My current code looks like:
url = 'https://www.compass.com/agents/irene-vuong/'
url = requests.get(url)
soup = BeautifulSoup(url.text, 'html')
for item in soup.findAll('div', attrs={'class': 'uc-listingCard-content'}):
    new = item.find('a', attrs={'class': 'uc-listingCard-title'})
    print(new.text)
My output is:
256-258 Wyckoff Street
1320 Glenwood Road
1473 East 55th Street
145 Winter Avenue
25-02 Brookhaven Avenue
which are the addresses of the "current" listings.
My expected output is:
352 94th Street
1754 West 12th Street
2283 E 23rd st
2063 Brown Street
3423 Avenue U
2256 Stuart Street
These are the addresses under "Recent Sales". No matter what I try, I only get the current listing addresses, never all of them. I tried to use re.compile(r'Recent Sales') but it would not work. I'm not sure how to get to "Recent Sales".
Any help will be greatly appreciated.
+++++
I also tried to use the text 'Recent Sales', as below:
for item in soup.findAll(text=re.compile(r'Recent Sales')).findNext():
    for i in item.find('div', attrs={'class':'profile-acive-listings'}):
        new = i.find('a', attrs={'class': 'uc-listingCard-title'})
        print(new.text)
But I get an error of:
AttributeError: ResultSet object has no attribute 'findNext'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
+++ I also tried to use the data-tn attribute 'recent-sales':
for item in soup.findAll('div', attrs={'data-tn':'recent-sales'}):
    new = item.findAll('a', attrs={'class': 'uc-listingCard-title'})
    print(new.text)
But it won't return anything.
You can use Selenium. It renders your page in an automated browser. From the rendered page you can then get the full HTML and retrieve your listings.
Try this:
from selenium import webdriver
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
browser.get("https://www.compass.com/agents/irene-vuong/")
html = browser.page_source
soup = BeautifulSoup(html, 'html')
for item in soup.findAll('div', attrs={'class': 'uc-listingCard-content'}):
    new = item.find('a', attrs={'class': 'uc-listingCard-title'})
    print(new.text)
This prints out:
256-258 Wyckoff Street
1320 Glenwood Road
1473 East 55th Street
145 Winter Avenue
25-02 Brookhaven Avenue
352 94th Street
1754 West 12th Street
2283 E 23rd St
2063 Brown Street
3423 Avenue U
2256 Stuart Street
East 61st Street
Edit:
If you want to parse the data from the raw HTML, you have to get it from a script tag.
Try this:
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'https://www.compass.com/agents/irene-vuong/'
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html')
script = soup.find_all("script")[4]
data = json.loads(script.text.split("window.__AGENT_PROFILE__ = ")[1])
data = data["data"]
df_sales = pd.DataFrame(data["closedDeals"]["sales"])
df_rentals = pd.DataFrame(data["closedDeals"]["rentals"])
This gives you pandas dataframes with all the listing data, like this:
listingIdSHA listingType location size price detailedInfo media dealInfo isOffMLS pageLink pageLinkSlug canonicalPageLink userListingCompliance
0 210837948508195937 2 {'prettyAddress': '352 94th Street', 'city': '... {'bedrooms': 4, 'bathrooms': 2.75} {'lastKnown': 1250000, 'formatted': '$1,250,000'} {'amenities': ['Driveway', 'Open Kitchen', 'Ga... [{'category': 0, 'thumbnailUrl': 'https://d278... {'disclaimer': 'No guarantee, warranty or repr... False /listing/352-94th-street-brooklyn-ny-11209/210... 352-94th-street-brooklyn-ny-11209 /listing/352-94th-street-brooklyn-ny-11209/210... {'descriptionCompliance': 0}
1 122690464561282785 2 {'prettyAddress': '1754 West 12th Street', 'ci... {'bedrooms': 4, 'bathrooms': 2} {'lastKnown': 1040000, 'formatted': '$1,040,000'} {'amenities': ['Basement', 'Private Outdoor Sp... [{'category': 0, 'thumbnailUrl': 'https://d278... {'disclaimer': 'No guarantee, warranty or repr... False /listing/1754-west-12th-street-brooklyn-ny-112... 1754-west-12th-street-brooklyn-ny-11223 /listing/1754-west-12th-street-brooklyn-ny-112... {'descriptionCompliance': 0}
2 NaN 2 {'prettyAddress': '2283 E 23rd St', 'neighborh... {'bedrooms': 3, 'bathrooms': 2} {'lastKnown': 800000, 'formatted': '$800,000'} NaN [{'category': 0, 'thumbnailUrl': 'https://d278... {'disclaimer': 'No guarantee, warranty or repr... False NaN 2283-e-23rd-st NaN NaN
3 235974146369023201 2 {'prettyAddress': '2063 Brown Street', 'city':... {'bedrooms': 3, 'bathrooms': 2} {'lastKnown': 755000, 'formatted': '$755,000'} NaN [{'category': 0, 'thumbnailUrl': 'https://d278... {'disclaimer': 'No guarantee, warranty or repr... False /listing/2063-brown-street-brooklyn-ny-11229/2... 2063-brown-street-brooklyn-ny-11229 /listing/2063-brown-street-brooklyn-ny-11229/2... {'descriptionCompliance': 0}
4 186865317970981409 2 {'prettyAddress': '3423 Avenue U', 'city': 'Br... {'bedrooms': 5, 'bathrooms': 2} {'lastKnown': 627000, 'formatted': '$627,000'} {'amenities': ['Hardwood Floors', 'Garage', 'C... [{'category': 0, 'thumbnailUrl': 'https://d278... {'disclaimer': 'No guarantee, warranty or repr... False /listing/3423-avenue-u-brooklyn-ny-11234/18686... 3423-avenue-u-brooklyn-ny-11234 /listing/3423-avenue-u-brooklyn-ny-11234/18686... {'descriptionCompliance': 0}
5 286987776170131617 2 {'prettyAddress': '2256 Stuart Street', 'city'... {'bedrooms': 3, 'bathrooms': 1} {'lastKnown': 533000, 'formatted': '$533,000'} NaN [{'category': 0, 'thumbnailUrl': 'https://d278... {'disclaimer': 'No guarantee, warranty or repr... False /listing/2256-stuart-street-brooklyn-ny-11229/... 2256-stuart-street-brooklyn-ny-11229 /listing/2256-stuart-street-brooklyn-ny-11229/...
To retrieve just the listing addresses, use this further step:
from pandas import json_normalize
df_sales = df_sales.location.apply(lambda x: dict(x))
df_sales = json_normalize(df_sales)
df_rentals = df_rentals.location.apply(lambda x: dict(x))
df_rentals = json_normalize(df_rentals)
Output:
prettyAddress city state zipCode geoId neighborhood subNeighborhoods
0 352 94th Street Brooklyn NY 11209 nyc NaN NaN
1 1754 West 12th Street Brooklyn NY 11223 nyc NaN NaN
2 2283 E 23rd St NaN NaN NaN nyc Sheepshead Bay [Sheepshead Bay]
3 2063 Brown Street Brooklyn NY 11229 nyc NaN NaN
4 3423 Avenue U Brooklyn NY 11234 nyc NaN NaN
5 2256 Stuart Street Brooklyn NY 11229 nyc NaN NaN
Edit:
You can get cleaner data like so:
df_sales = pd.DataFrame(data["closedDeals"]["sales"])
columns = ['listingIdSHA', 'listingType', 'location', 'size', 'price']
df_sales = df_sales[columns]
expanded_data = []
for column in ['location', 'size', 'price']:
    expanded = df_sales[column].apply(lambda x: dict(x))
    expanded_data.append(json_normalize(expanded))
expanded_data = pd.concat(expanded_data, axis=1)
df_sales_cleaned = pd.concat([df_sales[['listingIdSHA', 'listingType']], expanded_data], axis=1)
display(df_sales_cleaned)
Output:
listingIdSHA listingType prettyAddress city state zipCode geoId neighborhood subNeighborhoods bedrooms bathrooms lastKnown formatted
0 210837948508195937 2 352 94th Street Brooklyn NY 11209 nyc NaN NaN 4 2.75 1250000 $1,250,000
1 122690464561282785 2 1754 West 12th Street Brooklyn NY 11223 nyc NaN NaN 4 2.00 1040000 $1,040,000
2 NaN 2 2283 E 23rd St NaN NaN NaN nyc Sheepshead Bay [Sheepshead Bay] 3 2.00 800000 $800,000
3 235974146369023201 2 2063 Brown Street Brooklyn NY 11229 nyc NaN NaN 3 2.00 755000 $755,000
4 186865317970981409 2 3423 Avenue U Brooklyn NY 11234 nyc NaN NaN 5 2.00 627000 $627,000
5 286987776170131617 2 2256 Stuart Street Brooklyn NY 11229 nyc NaN NaN 3 1.00 533000 $533,000
I recently worked on a project using this too. Try matching the 'Recent Sales' text and then walking up to its containing div with findParent, something like this:
for item in soup.findAll(text=re.compile(r'Recent Sales')):
    section = item.findParent('div', {'class': 'profile-acive-listings'})
    if section:
        for link in section.findAll('a', {'class': 'uc-listingCard-title'}):
            print(link.text)
