I want to extract data from two HTML pages. When I extract data from one page and then move on to another page, some elements change: the data are held in a list, and the list changes from page to page.
My code for this problem is below:
# Find the "Restaurant Details" block(s) by element id; findAll returns a list of matches.
details_containers = soup_page.findAll("div",{"id":"RESTAURANT_DETAILS"})
# Collect every <div class="content"> found inside the first match.
details_container = details_containers[0].findAll("div",{"class":"content"})
# The fields below are selected purely by position in that list, so each
# variable only holds the intended value when the page has exactly the
# expected number of content blocks ahead of it.
cuisine = details_container[0].text.strip()
print(cuisine)
meals = details_container[1].text.strip()
print(meals)
hotel_features = details_container[2].text.strip()
print(hotel_features)
From the first HTML page I want the cuisine, meals, and restaurant_features content values. But this page also has some extra content values: hours and average prices.
<div id="RESTAURANT_DETAILS" class="content_block details_block scroll_tabs" data-tab="TABS_DETAILS">
<div class="header_with_improve wrap">
<a href="/UpdateListing-g297595-d6384395-Ocellus-Raipur_Raipur_District_Chhattisgarh.html" onclick="ta.setEvtCookie('UpdateListing', 'entry-detail-moreinfo', null, 0, '/UpdateListingRedesign')">
<div class="improve_listing_btn ui_button primary">Improve this listing</div>
</a>
<h3 class="tabs_header">Restaurant Details</h3> </div>
<div class="details_tab">
<div class="table_section">
<div class="row">
<div class="ratingSummary wrap">
<div class="histogramCommon bubbleHistogram wrap">
<div class="colTitle">
Rating summary
</div>
<ul class="barChart">
<li>
<div class="ratingRow wrap">
<div class="label part ">
<span class="text">Food</span>
</div>
<div class="wrap row part ">
<span class="ui_bubble_rating bubble_45" alt="4.5 of 5 bubbles"></span>
</div>
</div>
<div class="ratingRow wrap">
<div class="label part ">
<span class="text">Service</span>
</div>
<div class="wrap row part ">
<span class="ui_bubble_rating bubble_35" alt="3.5 of 5 bubbles"></span>
</div>
</div>
</li>
<li>
<div class="ratingRow wrap">
<div class="label part ">
<span class="text">Value</span>
</div>
<div class="wrap row part ">
<span class="ui_bubble_rating bubble_35" alt="3.5 of 5 bubbles"></span>
</div>
</div>
</li>
</ul>
</div>
</div>
</div>
<div class="row">
<div class="title">
Average prices
</div>
<div class="content">
<span>₹ 448 -
₹ 768</span>
</div>
</div>
<div class="row">
<div class="title">
Cuisine
</div>
<div class="content">
Indian, Asian, Italian, French, Chinese, International, Vegetarian Friendly
</div>
</div>
<div class="row">
<div class="title">
Meals
</div>
<div class="content">
Breakfast, Lunch, Dinner, Brunch
</div>
</div>
<div class="row">
<div class="title">
Restaurant features
</div>
<div class="content">
Reservations, Seating, Takeout, Private Dining, Waitstaff
</div>
</div>
<div class="row">
<div class="title">
Good for
</div>
<div class="content">
Groups, Business meetings, Child-friendly
</div>
</div>
<div class="row">
<div class="hours title">
Open Hours
</div>
<div class="hours content">
<div class="detail">
<span class="day">Sunday</span>
<span class="hours"><div class="hoursRange">07:00 - 23:00</div></span>
</div>
<div class="detail">
<span class="day">Monday</span>
<span class="hours"><div class="hoursRange">07:00 - 23:00</div></span>
</div>
<div class="detail">
<span class="day">Tuesday</span>
<span class="hours"><div class="hoursRange">07:00 - 23:00</div></span>
</div>
<div class="detail">
<span class="day">Wednesday</span>
<span class="hours"><div class="hoursRange">07:00 - 23:00</div></span>
</div>
<div class="detail">
<span class="day">Thursday</span>
<span class="hours"><div class="hoursRange">07:00 - 23:00</div></span>
</div>
<div class="detail">
<span class="day">Friday</span>
<span class="hours"><div class="hoursRange">07:00 - 23:00</div></span>
</div>
<div class="detail">
<span class="day">Saturday</span>
<span class="hours"><div class="hoursRange">07:00 - 23:00</div></span>
</div>
</div>
</div>
</div>
<div class="additional_info">
<div class="title">
Location and Contact Information </div>
<div class="content">
<ul class="detailsContent">
<li>
<div class="detail">Address:
<span> <span class="format_address"><span class="street-address">G.E. Road</span> | <span class="extended-address">Mayura Hotel</span>, <span class="locality">Raipur 492001, </span><span class="country-name">India</span> </span>
</span>
</div>
</li>
<li>
<div class="detail">Location:
<span> Asia</span>
<span> > India</span>
<span> > Chhattisgarh</span>
<span> > Raipur District</span>
<span> > Raipur</span>
</div>
</li>
<li>
<div class="detail">Phone Number:
<span>+91 77142 00500</span>
</div>
</li>
<li>
<span class="ui_icon email"></span>
<a target="_blank"" href="mailto:banquet#themayurahotels.com" onclick="ta.trackEventOnPage('Eatery_Listing','Email','6384395')">
E-mail </a>
</li>
<!--trkP:waypoint_for_poi_2-->
<!-- PLACEMENT waypoint_for_poi -->
<div id="taplc_waypoint_for_poi_1" class="ppr_rup ppr_priv_waypoint_for_poi" data-placement-name="waypoint_for_poi">
</div>
<!--etk-->
</ul>
</div>
</div>
<!--[if lte IE 9]>
<style>
.details_block .threeColumnList{
height: 350px;
overflow: auto;
}
</style>
<![endif]-->
</div>
</div>
From the second HTML page I want the cuisine, meals, and restaurant_features content values, as in the HTML above. But on this page the extra content values for hours and average prices are not present.
<div id="RESTAURANT_DETAILS" class="content_block details_block scroll_tabs" data-tab="TABS_DETAILS">
<div class="header_with_improve wrap">
<a href="/UpdateListing-g297595-d8595502-Barbeque_Nation-Raipur_Raipur_District_Chhattisgarh.html" onclick="ta.setEvtCookie('UpdateListing', 'entry-detail-moreinfo', null, 0, '/UpdateListingRedesign')">
<div class="improve_listing_btn ui_button primary">Improve this listing</div>
</a>
<h3 class="tabs_header">Restaurant Details</h3> </div>
<div class="details_tab">
<div class="table_section">
<div class="row">
<div class="ratingSummary wrap">
<div class="histogramCommon bubbleHistogram wrap">
<div class="colTitle">
Rating summary
</div>
<ul class="barChart">
<li>
<div class="ratingRow wrap">
<div class="label part ">
<span class="text">Food</span>
</div>
<div class="wrap row part ">
<span class="ui_bubble_rating bubble_45" alt="4.5 of 5 bubbles"></span>
</div>
</div>
<div class="ratingRow wrap">
<div class="label part ">
<span class="text">Service</span>
</div>
<div class="wrap row part ">
<span class="ui_bubble_rating bubble_45" alt="4.5 of 5 bubbles"></span>
</div>
</div>
</li>
<li>
<div class="ratingRow wrap">
<div class="label part ">
<span class="text">Value</span>
</div>
<div class="wrap row part ">
<span class="ui_bubble_rating bubble_40" alt="4.0 of 5 bubbles"></span>
</div>
</div>
</li>
</ul>
</div>
</div>
</div>
<div class="row">
<div class="title">
Cuisine
</div>
<div class="content">
Indian, Barbecue, Asian, Vegetarian Friendly, Vegan Options, Gluten Free Options
</div>
</div>
<div class="row">
<div class="title">
Meals
</div>
<div class="content">
Lunch, Dinner
</div>
</div>
<div class="row">
<div class="title">
Restaurant features
</div>
<div class="content">
Reservations, Seating, Waitstaff, Wheelchair Accessible, Validated Parking
</div>
</div>
<div class="row">
<div class="title">
Good for
</div>
<div class="content">
Groups, Special Occasion Dining, Kids, Child-friendly
</div>
</div>
</div>
<div class="additional_info">
<div class="title">
Location and Contact Information </div>
<div class="content">
<ul class="detailsContent">
<li>
<div class="detail">Address:
<span> <span class="format_address"> | <span class="extended-address">Magneto The Mall, 2nd Floor</span>, <span class="locality">Raipur 429010, </span><span class="country-name">India</span> </span>
</span>
</div>
</li>
<li>
<div class="detail">Location:
<span> Asia</span>
<span> > India</span>
<span> > Chhattisgarh</span>
<span> > Raipur District</span>
<span> > Raipur</span>
</div>
</li>
<li>
<div class="detail">Phone Number:
<span>+91 77160 60008</span>
</div>
</li>
<li>
<span class="ui_icon email"></span>
<a target="_blank"" href="mailto:feedback#barbeque-nation.com" onclick="ta.trackEventOnPage('Eatery_Listing','Email','8595502')">
E-mail </a>
</li>
<!--trkP:waypoint_for_poi_2-->
<!-- PLACEMENT waypoint_for_poi -->
<div id="taplc_waypoint_for_poi_1" class="ppr_rup ppr_priv_waypoint_for_poi" data-placement-name="waypoint_for_poi">
</div>
<!--etk-->
</ul>
</div>
</div>
<!--[if lte IE 9]>
<style>
.details_block .threeColumnList{
height: 350px;
overflow: auto;
}
</style>
<![endif]-->
</div>
</div>
Instead of obtaining a list of all <div class="content"> blocks and selecting several by their index (which is changing from the first page to the second), you can find all <div class="row">, which contain a title and the respective content.
# Search inside the <div id="RESTAURANT_DETAILS"> tag itself (details_containers[0]).
# The question's `details_container` is the list returned by findAll, and a
# result list cannot be searched with findAll.
rows = details_containers[0].findAll('div', {'class': 'row'})
# used to store data extracted from HTML <div class="row"> elements
data = {}
for row in rows:
    title = row.find('div', {'class': 'title'})
    content = row.find('div', {'class': 'content'})
    # Rows without both a title and a content div (e.g. the rating summary) are skipped.
    if title is not None and content is not None:
        # here I am just formatting the dict key to be more python-ish. totally optional
        title = title.text.strip().lower().replace(' ', '-')
        data[title] = content
# tested with the HTML from the first page
# (print is written in call form so the snippet parses on both Python 2 and 3;
# the sample output below is from Python 2)
print(data.keys())
#=> [u'cuisine', u'restaurant-features', u'average-prices', u'good-for', u'open-hours', u'meals']
print(type(data['cuisine']))
#=> <class 'bs4.element.Tag'>
Now you can extract the content items from the HTML webpage without caring what order they appear in. This code should work on any HTML that has the same general structure as the two pages you provided. I hope this helps!
Related
I'm trying to click a button, but it's proving really difficult. More precisely, I want to do it for several buttons. I have tried several options, but none of them works properly. Here is the HTML code for the button:
<button class="rlg-trade__action rlg-trade__bump --bump " type="button" data-alias="bd520a66-cc88-4af8-ba92-30111bbdbd02" data-preventtext="Bumping…">
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><g style="stroke-linecap:round;stroke-linejoin:round;stroke:#6a717f;fill:none;stroke-miterlimit:10" transform="translate(.5 .5)"><path d="m12 23v-13"></path><path d="m16 14-4-4-4 4"></path><g stroke="#6a717f"><path d="m4 17h-3v-16h22v16h-3"></path><path d="m1 5h22"></path></g></g></svg>
<span>Bump</span>
</button>
And here is the code for scraping:
# Locate the trade list, then click the Bump button of each trade in turn.
trades_column = self.driver.find_element_by_css_selector('.rlg-trading__intersect') #Section where all trades are listed
trades_list = trades_column.find_elements(By.CLASS_NAME,'rlg-trade') #Search each trade element, all of them include the bump button posted above.
for trades in trades_list:
    # The lookup is scoped to the current trade element, not the whole page.
    bump = trades.find_element_by_css_selector('.rlg-trade__action.rlg-trade__bump.--bump').click()
    print('Trade bumped successfully!')
    time.sleep(1)
    self.driver.find_element_by_css_selector('i.fa').click() #This is a click in the page to exit a box that appear after the click.
I have also tried by XPath, and that is not working either. Any help would be appreciated.
Output error: selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"button.rlg-trade__action.rlg-trade__bump.--bump"}
<div class="rlg-trade" data-i="0"> its every item in the trade list section rlg-trading__intersect.
<div class="col-3-3 rlg-trading__intersect">
<div class="rlg-trade" data-i="0" style="">
<header class="rlg-trade__header
">
<a href="/player/isaacdl" class="rlg-trade__user">
<div class="rlg-trade__avatar"><img loading="lazy" class="rlg-trade__avatarimage" src="/content/media/users/avatar/128px/b35468f2331659986325.png" alt="isaacdl"></div>
<div class="rlg-trade__meta">
<div class="rlg-trade__username">
isaacdl
</div>
<span class="rlg-trade__info">
<span class="rlg-trade__time">
<span>5 hours ago</span>
<span>5 hours, 20 minutes, 40 seconds ago</span>
</span>
<span class="rlg-trade__delinfo">·
This trade will be deleted in <strong class="rlg-trade__timeleft">14 days</strong> if you don't bump it.</span>
</span>
</div>
</a>
<div class="rlg-trade__platforms">
<a target="_blank" rel="noopener" class="rlg-trade__platform" style="order: 100;" href="https://steamcommunity.com/profiles/76561198120028164" onclick="event.preventDefault();phishingAware('https://steamcommunity.com/profiles/76561198120028164');">
<img class="rlg-trade__platformlogo" src="https://static.rocket-league.com/assets/b81c8860521ff08c3d8194c2eca3491c1b158f13/images/logos/windowspc_black.svg" alt="Windows PC">
<div class="rlg-trade__platformname">
<span>Add on
Steam
</span>
<span>
auchan </span>
</div>
</a>
</div>
</header>
<div class="rlg-trade__content">
<div class="rlg-trade__labels">
<div class="rlg-trade__haslabel">
Has
</div>
<div class="rlg-trade__wantslabel">
Wants
</div>
</div>
<div class="rlg-trade__items">
<div class="rlg-trade__itemshas ">
<div class="rlg-item --very-rare --hover">
<div class="rlg-item__gradient --very-rare"></div>
<img loading="lazy" class="rlg-item__image" src="/content/media/items/avatar/220px/a67e907fb81451699877.png" alt="Cristiano ">
<div class="rlg-item__text">
<h2 class="rlg-item__name">Cristiano</h2>
</div>
<div class="rlg-item-links">
<a class="rlg-btn-primary --small" href="/items/wheels/cristiano">Item details</a>
<a class="rlg-btn-secondary --small" href="/trading/?filterItem=148&filterCertification=0&filterPaint=0&filterPlatform=0&filterItemType=1">Find trades</a>
</div>
</div>
<div class="rlg-item --very-rare --hover">
<div class="rlg-item__gradient --very-rare"></div>
<img loading="lazy" class="rlg-item__image" src="/content/media/items/avatar/220px/a67e907fb81451699877.png" alt="Cristiano ">
<div class="rlg-item__text">
<h2 class="rlg-item__name">Cristiano</h2>
</div>
<div class="rlg-item-links">
<a class="rlg-btn-primary --small" href="/items/wheels/cristiano">Item details</a>
<a class="rlg-btn-secondary --small" href="/trading/?filterItem=148&filterCertification=0&filterPaint=0&filterPlatform=0&filterItemType=1">Find trades</a>
</div>
</div>
</div>
<div class="rlg-trade__wantslabel rlg-trade__wantslabelalt">
Wants
</div>
<div class="rlg-trade__itemswants ">
<div class="rlg-item --premium --hover">
<div class="rlg-item__gradient --premium"></div>
<img loading="lazy" class="rlg-item__image" src="/content/media/items/avatar/220px/da6ecd87091575484054.png" alt="Credits ">
<div class="rlg-item__text">
<h2 class="rlg-item__name">Credits</h2>
</div>
<div class="rlg-item__quantity --quantity-80 --premium">
80 </div>
<div class="rlg-item-links">
<a class="rlg-btn-primary --small" href="/items/misc/credits">Item details</a>
<a class="rlg-btn-secondary --small" href="/trading/?filterItem=2615&filterCertification=0&filterPaint=0&filterPlatform=0&filterItemType=1">Find trades</a>
</div>
</div>
<div class="rlg-item --premium --hover">
<div class="rlg-item__gradient --premium"></div>
<img loading="lazy" class="rlg-item__image" src="/content/media/items/avatar/220px/da6ecd87091575484054.png" alt="Credits ">
<div class="rlg-item__text">
<h2 class="rlg-item__name">Credits</h2>
</div>
<div class="rlg-item__quantity --quantity-80 --premium">
80 </div>
<div class="rlg-item-links">
<a class="rlg-btn-primary --small" href="/items/misc/credits">Item details</a>
<a class="rlg-btn-secondary --small" href="/trading/?filterItem=2615&filterCertification=0&filterPaint=0&filterPlatform=0&filterItemType=1">Find trades</a>
</div>
</div>
</div>
</div>
</div>
<div class="rlg-trade__actions">
<button class="rlg-trade__action rlg-trade__bump --bump " type="button" data-alias="bd520a66-cc88-4af8-ba92-30111bbdbd02" data-preventtext="Bumping…">
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><g style="stroke-linecap:round;stroke-linejoin:round;stroke:#6a717f;fill:none;stroke-miterlimit:10" transform="translate(.5 .5)"><path d="m12 23v-13"></path><path d="m16 14-4-4-4 4"></path><g stroke="#6a717f"><path d="m4 17h-3v-16h22v16h-3"></path><path d="m1 5h22"></path></g></g></svg>
<span>Bump</span>
</button>
<a class="rlg-trade__action rlg-trade__edit --edit" href="/trade/edit?trade=bd520a66-cc88-4af8-ba92-30111bbdbd02">
<svg viewBox="0 0 12 12" xmlns="http://www.w3.org/2000/svg"><g fill="none" stroke="#6a717f" stroke-linecap="round" stroke-linejoin="round"><path d="m9.5.5 2 2-5 5-3 1 1-3z"></path><path d="m10.5 8.5v2a1 1 0 0 1 -1 1h-8a1 1 0 0 1 -1-1v-8a1 1 0 0 1 1-1h2" stroke="#6a717f"></path></g></svg>
<span>Edit trade</span>
</a>
<a class="rlg-trade__action rlg-trade__disable --disable" href="/functions/disableTrade.php?trade=bd520a66-cc88-4af8-ba92-30111bbdbd02" onclick="return confirm('Are you sure you want to disable this trade? This trade will be permanently removed.')">
<svg height="20" viewBox="0 0 24 24" width="20" xmlns="http://www.w3.org/2000/svg"><g style="stroke-linecap:round;stroke-linejoin:round;stroke-width:1.2;fill:none;stroke:#6a717f;stroke-miterlimit:10"><path d="m20.3 4.7-15.6 15.6"></path><circle cx="12.5" cy="12.5" r="11"></circle></g></svg> <span>Disable trade</span>
</a>
<a href="/trade/bd520a66-cc88-4af8-ba92-30111bbdbd02" class="rlg-trade__action --comments">
<svg viewBox="0 0 18 21" xmlns="http://www.w3.org/2000/svg"><g fill="none" fill-rule="evenodd" stroke="#545454" stroke-linecap="round" stroke-linejoin="round"><path d="m1.421053 1.421053h15.157895v18.526316h-15.157895z"></path><path d="m4.789474 4.789474h8.421053v5.052632h-8.421053z"></path><path d="m4.789474 13.210526h8.421052"></path><path d="m4.789474 16.578947h8.421052"></path></g></svg>
<span>Comments</span>
</a>
</div>
<div class="rlg-trade__note">
Have 2x of them. Please fast. </div>
<button class="rlg-trade__noteexpand" style="display: none;">Show full trade description</button>
</div>
<div class="rlg-trade rlg-trade-placeholder" data-i="1" style="height: 323px; box-shadow: none;"></div>
<div class="rlg-trade rlg-trade-placeholder" data-i="2" style="height: 323px; box-shadow: none;"></div>
<div class="rlg-trade rlg-trade-placeholder" data-i="3" style="height: 323px; box-shadow: none;"></div>
</div>
Try using the following XPath to click on the element.
for trades in trades_list:
    # Placeholder rows (class "rlg-trade rlg-trade-placeholder") are empty and
    # contain no Bump button. find_elements returns an empty list for them
    # instead of raising NoSuchElementException, so they can simply be skipped.
    bump_buttons = trades.find_elements(By.XPATH,'.//button[.//span[text()="Bump"]]')
    if not bump_buttons:
        continue
    bump_buttons[0].click()
    print('Trade bumped successfully!')
    time.sleep(1)
I would like to be able to click on any part of a row that meets 2 criteria:
timeslot that I set (ie 5:30 - 6:30 pm)
can filter based on value Co-Ed or Women's Only
I'm unsure how to filter on these values. I thought maybe of searching the values of the row list:
timeslot = driver.find_elements_by_class_name("row.c-schedule-calendar__class-schedule-listitem-wrapper.c-schedule-calendar__workout-schedule-list-item")
Below is the html code
<li tabindex="0" class="row c-schedule-calendar__class-schedule-listitem-wrapper c-schedule-calendar__workout-schedule-list-item" data-index="22" data-workout-id="156986" data-club-id="204">
<div class="col-md-12 col-lg-2 time-duration clickable js-single-class-list-item">
<span class="js-class-time">7:45 pm</span> <span class="dot-separator">-</span> <span class="js-class-duration">8:45 pm</span>
</div>
<div class="col-md-12 col-lg-10">
<div class="row">
<div class="col-8 clickable js-single-class-list-item">
<div class="row">
<div class="col-md-12 col-lg-6">
<div class="class-name"><span class="js-class-name">General Workout Area</span></div>
<div class="class-short-info">
<span class="js-class-type">Co-ed</span>
</div>
</div>
<div class="col-md-12 col-lg-6">
<div class="class-address-wrapper">
<span class="class-address">
<svg viewBox="0 0 48 48" class="c-search-class-filter--location__pin-icon">
<use xlink:href="/etc.clientlibs/goodlife/clientlibs/clientlib-site/resources/images/icons.svg#ic_pin_default"></use>
</svg> Cambridge Hespeler And Eagle
</span>
</div>
</div>
</div>
</div>
<div class="col-4 c-schedule-calendar__cta-container">
<div class="js-class-action-container">
<button class="c-btn-outlined class-action" data-class-action="book-class" data-class-action-step="class-action-confirmation" data-workout-id="156986" data-club-id="204" data-waitlistable="false"><span class="c-btn__label">Book</span></button>
</div>
<div class="available-spots">
<div class="js-available-spots-container">
2 spots left
</div>
</div>
</div>
</div>
</div></li>
time_range = "5:30 pm - 6:30 pm"
# XPath string comparison is case-sensitive, and the page renders the class
# type as "Co-ed" (<span class="js-class-type">Co-ed</span>), so the value
# must be spelled exactly that way to match.
category = "Co-ed"
# Split the range once into the start and end labels shown in the row.
start_time, end_time = time_range.split(' - ')
# Matches an <li> that has a descendant text node equal to the start time,
# a <span> whose text is the end time, and a <span> whose text is the category.
row_xpath = "//li[descendant::text()='{}' and descendant::span[text()='{}'] and descendant::span[text()='{}']]".format(start_time, end_time, category)
This XPath should work, so instead of searching by class name, use:
driver.find_element_by_xpath("//li[descendant::text()='{}' and descendant::span[text()='{}'] and descendant::span[text()='{}']]".format(time_range.split(' - ')[0],time_range.split(' - ')[1], category)).click()
Problem Introduction
Language version: Python 3.8
Operating System: Windows 10
Other relevant software: Jupyter notebook and html-requests
Context:
I have been following along with this tutorial to scrape stackoverflow for questions. My goal is to extract the answers (from the url of the question) and who answered it. However, I am having difficulty determining what classes/id's to search for in the html of a question
Things I have tried:
I have attempted searching under ('.container') for things like '.post-layout', '.mb0', '#answers', and '#answers-headers', with marginal, cluttered success.
Below is an excerpt from the code I am using to parse the pages (not the questions); here is the GitHub link:
def parse_tagged_page(html):
    """Extract question, vote and tag data from a question-listing page.

    Args:
        html: Parsed page object exposing ``find(selector)``. Presumably a
            requests-html ``HTML`` instance -- confirm against the caller.

    Returns:
        A list with one dict per ``.question-summary`` element. Each dict has
        the keys ``'question'``, ``'votes'`` and ``'tags'``; every value is
        whatever ``clean_scraped_data`` returns for the matching element's text.
    """
    question_summaries = html.find(".question-summary")
    # Output keys and the CSS selectors that locate their values, pairwise.
    key_names = ['question', 'votes', 'tags']
    classes_needed = ['.question-hyperlink', '.vote', '.tags']
    datas = []
    for q_el in question_summaries:
        question_data = {}
        for keyname, _class in zip(key_names, classes_needed):
            # NOTE(review): if a summary lacks one of these elements, sub_el is
            # presumably None and `.text` raises AttributeError -- confirm.
            sub_el = q_el.find(_class, first=True)
            question_data[keyname] = clean_scraped_data(sub_el.text, keyname=keyname)
        datas.append(question_data)
    return datas
An example of the html code I am looking for is below.
html code on this question:
<div id="answers">
<a name="tab-top"></a>
<div id="answers-header">
<div class="answers-subheader grid ai-center mb8">
<div class="grid--cell fl1">
<h2 class="mb0" data-answercount="13">
13 Answers
<span style="display:none;" itemprop="answerCount">13</span>
</h2>
</div>
<div class="grid--cell">
<div class=" grid s-btn-group js-filter-btn">
<a class="grid--cell s-btn s-btn__muted s-btn__outlined" href="/questions/19254583/how-do-i-host-multiple-node-js-sites-on-the-same-ip-server-with-different-domain?answertab=active#tab-top" data-nav-xhref="" title="Answers with the latest activity first" data-value="active" data-shortcut="A">
Active</a>
<a class="grid--cell s-btn s-btn__muted s-btn__outlined" href="/questions/19254583/how-do-i-host-multiple-node-js-sites-on-the-same-ip-server-with-different-domain?answertab=oldest#tab-top" data-nav-xhref="" title="Answers in the order they were provided" data-value="oldest" data-shortcut="O">
Oldest</a>
<a class="youarehere is-selected grid--cell s-btn s-btn__muted s-btn__outlined" href="/questions/19254583/how-do-i-host-multiple-node-js-sites-on-the-same-ip-server-with-different-domain?answertab=votes#tab-top" data-nav-xhref="" title="Answers with the highest score first" data-value="votes" data-shortcut="V">
Votes</a>
</div>
</div>
</div>
</div>
<a name="19254824"></a>
<div id="answer-19254824" class="answer accepted-answer" data-answerid="19254824" itemprop="acceptedAnswer" itemscope="" itemtype="http://schema.org/Answer">
<div class="post-layout">
<div class="votecell post-layout--left">
<div class="js-voting-container grid fd-column ai-stretch gs4 fc-black-200" data-post-id="19254824">
<button class="js-vote-up-btn grid--cell s-btn s-btn__unset c-pointer" data-controller="s-tooltip" data-s-tooltip-placement="right" aria-pressed="false" aria-label="Up vote" data-selected-classes="fc-theme-primary" aria-describedby="--stacks-s-tooltip-zxmm3912"><svg aria-hidden="true" class="m0 svg-icon iconArrowUpLg" width="36" height="36" viewBox="0 0 36 36"><path d="M2 26h32L18 10 2 26z"></path></svg></button><div id="--stacks-s-tooltip-zxmm3912" class="s-popover s-popover__tooltip pe-none" aria-hidden="true" role="tooltip">This answer is useful<div class="s-popover--arrow"></div></div>
<div class="js-vote-count grid--cell fc-black-500 fs-title grid fd-column ai-center" itemprop="upvoteCount" data-value="83">83</div>
<button class="js-vote-down-btn grid--cell s-btn s-btn__unset c-pointer" data-controller="s-tooltip" data-s-tooltip-placement="right" aria-pressed="false" aria-label="Down vote" data-selected-classes="fc-theme-primary" aria-describedby="--stacks-s-tooltip-waz8801n"><svg aria-hidden="true" class="m0 svg-icon iconArrowDownLg" width="36" height="36" viewBox="0 0 36 36"><path d="M2 10h32L18 26 2 10z"></path></svg></button><div id="--stacks-s-tooltip-waz8801n" class="s-popover s-popover__tooltip pe-none" aria-hidden="true" role="tooltip">This answer is not useful<div class="s-popover--arrow"></div></div>
<div class="js-accepted-answer-indicator grid--cell fc-green-500 ta-center py4" data-s-tooltip-placement="right" title="Loading when this answer was accepted…" tabindex="0" role="note" aria-label="Accepted">
<svg aria-hidden="true" class="svg-icon iconCheckmarkLg" width="36" height="36" viewBox="0 0 36 36"><path d="M6 14l8 8L30 6v8L14 30l-8-8v-8z"></path></svg>
</div>
<a class="js-post-issue grid--cell s-btn s-btn__unset c-pointer py6 mx-auto" href="/posts/19254824/timeline" data-shortcut="T" data-controller="s-tooltip" data-s-tooltip-placement="right" aria-label="Timeline" aria-describedby="--stacks-s-tooltip-djt8qt69"><svg aria-hidden="true" class="mln2 mr0 svg-icon iconHistory" width="19" height="18" viewBox="0 0 19 18"><path d="M3 9a8 8 0 113.73 6.77L8.2 14.3A6 6 0 105 9l3.01-.01-4 4-4-4h3L3 9zm7-4h1.01L11 9.36l3.22 2.1-.6.93L10 10V5z"></path></svg></a><div id="--stacks-s-tooltip-djt8qt69" class="s-popover s-popover__tooltip pe-none" aria-hidden="true" role="tooltip">Show activity on this post.<div class="s-popover--arrow"></div></div>
</div>
</div>
<div class="answercell post-layout--right">
<div class="s-prose js-post-body" itemprop="text">
<p>Choose one of:</p>
<ul>
<li>Use some other server (like nginx) as a reverse proxy.</li>
<li>Use node-http-proxy as a reverse proxy.</li>
<li>Use the vhost middleware if each domain can be served from the same Connect/Express codebase and node.js instance.</li>
</ul>
</div>
<div class="mt24">
<div class="grid fw-wrap ai-start jc-end gs8 gsy">
<time itemprop="dateCreated" datetime="2013-10-08T17:53:13"></time>
<div class="grid--cell mr16" style="flex: 1 1 100px;">
<div class="post-menu">
share<div class="s-popover z-dropdown" style="width: unset; max-width: 28em;" id="se-share-sheet-1"><div class="s-popover--arrow"></div><div><span class="js-title fw-bold">Share a link to this answer</span> <span class="js-subtitle">(includes your user id)</span></div><div class="my8"><input type="text" class="js-input s-input wmn3 sm:wmn-initial" readonly=""></div><div class="d-flex jc-space-between mbn4"><button class="js-copy-link-btn s-btn s-btn__link">Copy link</button>CC BY-SA 3.0<div class="js-social-container"></div></div></div>
<span class="lsep">|</span>
edit
<span class="lsep">|</span>
<button id="btnFollowPost-19254824" class="s-btn s-btn__link fc-black-400 h:fc-black-700 pb2 js-follow-post js-follow-answer js-gps-track" role="button" data-gps-track="post.click({ item: 14, priv: -1, post_type: 2 })" data-controller="s-tooltip " data-s-tooltip-placement="bottom" data-s-popover-placement="bottom" aria-controls="" aria-describedby="--stacks-s-tooltip-nb9azr0k">
follow
</button><div id="--stacks-s-tooltip-nb9azr0k" class="s-popover s-popover__tooltip pe-none" aria-hidden="true" role="tooltip">Follow this answer to receive notifications<div class="s-popover--arrow"></div></div>
<span class="lsep">|</span>
</div>
</div>
<div class="post-signature grid--cell fl0">
<div class="user-info user-hover">
<div class="user-action-time">
edited <span title="2017-05-23 11:33:25Z" class="relativetime">May 23 '17 at 11:33</span>
</div>
<div class="user-gravatar32">
<div class="gravatar-wrapper-32"><img src="https://www.gravatar.com/avatar/a007be5a61f6aa8f3e85ae2fc18dd66e?s=32&d=identicon&r=PG" alt="" width="32" height="32" class="bar-sm"></div>
</div>
<div class="user-details">
Community<span class="mod-flair " title="moderator">♦</span>
<div class="-flair">
<span class="reputation-score" title="reputation score " dir="ltr">1</span><span title="1 silver badge" aria-hidden="true"><span class="badge2"></span><span class="badgecount">1</span></span><span class="v-visible-sr">1 silver badge</span>
</div>
</div>
</div> </div>
<div class="post-signature grid--cell fl0">
<div class="user-info user-hover">
<div class="user-action-time">
answered <span title="2013-10-08 17:53:13Z" class="relativetime">Oct 8 '13 at 17:53</span>
</div>
<div class="user-gravatar32">
<div class="gravatar-wrapper-32"><img src="https://i.stack.imgur.com/eLXTL.jpg?s=32&g=1" alt="" width="32" height="32" class="bar-sm"></div>
</div>
<div class="user-details" itemprop="author" itemscope="" itemtype="http://schema.org/Person">
josh3736<span class="d-none" itemprop="name">josh3736</span>
<div class="-flair">
<span class="reputation-score" title="reputation score 119,818" dir="ltr">120k</span><span title="24 gold badges" aria-hidden="true"><span class="badge1"></span><span class="badgecount">24</span></span><span class="v-visible-sr">24 gold badges</span><span title="198 silver badges" aria-hidden="true"><span class="badge2"></span><span class="badgecount">198</span></span><span class="v-visible-sr">198 silver badges</span><span title="245 bronze badges" aria-hidden="true"><span class="badge3"></span><span class="badgecount">245</span></span><span class="v-visible-sr">245 bronze badges</span>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="post-layout--right">
<div id="comments-19254824" class="comments js-comments-container bt bc-black-2 mt12 " data-post-id="19254824" data-min-length="15">
<ul class="comments-list js-comments-list" data-remaining-comments-count="0" data-canpost="false" data-cansee="true" data-comments-unavailable="false" data-addlink-disabled="true">
<li id="comment-45028507" class="comment js-comment " data-comment-id="45028507">
<div class="js-comment-actions comment-actions">
<div class="comment-score js-comment-edit-hide">
<span title="number of 'useful comment' votes received" class="cool">3</span>
</div>
</div>
<div class="comment-text js-comment-text-and-form">
<div class="comment-body js-comment-edit-hide">
<span class="comment-copy">that's a very good and brief list of the options I've read elsewhere. Do you happen to know for each of these solutions which processes would need to be restarted when a new domain is added? For 1) none. For 2) only the node-http-proxy. For 3) the entire thread of all sites would need to be restarted. Is this correct?</span>
– Flion
<span class="comment-date" dir="ltr"><a class="comment-link" href="#comment45028507_19254824"><span title="2015-02-05 10:48:37Z, License: CC BY-SA 3.0" class="relativetime-clean">Feb 5 '15 at 10:48</span></a></span>
</div>
</div>
</li>
<li id="comment-45045094" class="comment js-comment " data-comment-id="45045094">
<div class="js-comment-actions comment-actions">
<div class="comment-score js-comment-edit-hide">
<span title="number of 'useful comment' votes received" class="cool">1</span>
</div>
</div>
<div class="comment-text js-comment-text-and-form">
<div class="comment-body js-comment-edit-hide">
<span class="comment-copy">#Flion: You could write the node-based proxies in such a way that you could reload the domain configuration without requiring a process restart. It really depends on your app's exact requirements.</span>
– josh3736
<span class="comment-date" dir="ltr"><a class="comment-link" href="#comment45045094_19254824"><span title="2015-02-05 17:50:17Z, License: CC BY-SA 3.0" class="relativetime-clean">Feb 5 '15 at 17:50</span></a></span>
</div>
</div>
</li>
<li id="comment-107457123" class="comment js-comment " data-comment-id="107457123">
<div class="js-comment-actions comment-actions">
<div class="comment-score js-comment-edit-hide">
</div>
</div>
<div class="comment-text js-comment-text-and-form">
<div class="comment-body js-comment-edit-hide">
<span class="comment-copy">Not what was asked.</span>
– Patrick Sturm
<span class="comment-date" dir="ltr"><a class="comment-link" href="#comment107457123_19254824"><span title="2020-03-18 07:47:44Z, License: CC BY-SA 4.0" class="relativetime-clean">Mar 18 at 7:47</span></a></span>
</div>
</div>
</li>
</ul>
</div>
<div id="comments-link-19254824" data-rep="50" data-reg="true">
<a class="js-add-link comments-link disabled-link" title="Use comments to ask for more information or suggest improvements. Avoid comments like “+1” or “thanks”." href="#" role="button">add a comment</a>
<span class="js-link-separator dno"> | </span>
<a class="js-show-link comments-link dno" title="expand to show all comments on this post" href="#" onclick="" role="button"></a>
</div>
</div>
</div>
</div>
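You should look for the .answercell class: in the HTML above, each answer's body (.js-post-body) and its author signature (.user-details) sit inside <div class="answercell post-layout--right">.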
I have the following html code:
<div class="xyOfqd">
<div class="aAAD">
<div class="Bgbcca">Updated</div>
<span class="hthtb">
<div>
<span class="hthtb">September 30, 2018</span>
</div>
</span>
</div>
<div class="aAAD">
<div class="Bgbcca">Text1</div>
<span class="hthtb">
<div><span class="hthtb">Text2</span></div>
</span>
</div>
<div
class="aAAD">
<div class="Bgbcca">MyText</div>
<span class="hthtb">
<div>
<span class="hthtb">Text3</span>
</div>
</span>
</div>
<div class="aAAD">
<div class="Bgbcca">Text4</div>
<span class="hthtb">
<div><span
class="hthtb">Text5</span></div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">Text6</div>
<span class="hthtb">
<div><span
class="hthtb">Text7</span></div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">
Text8/div>
<span class="hthtb">
<div>
<span class="hthtb">
<div>Text9</div>
<div>Text10</div>
</span>
</div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">Text11</div>
<span class="hthtb">
<div><span class="hthtb">Text12</span></div>
</span>
</div>
How can I find Text3 which is located right after the div element with the string of MyText?
You can use lxml.html solution:
from lxml import html

# Abbreviated copy of the question's markup; the "..." line stands in for
# the rows that were left out.
source = """
<div class="xyOfqd">
<div class="aAAD">
<div class="Bgbcca">Updated</div>
...
<span class="hthtb">
<div><span class="hthtb">Text12</span></div>
</span>
</div>"""

# Start at the <div> whose string value is exactly "MyText", move to the
# <span> siblings that follow it, then read the text of the div/span nested
# inside.
value_after_label = '//div[.="MyText"]/following-sibling::span/div/span/text()'

tree = html.fromstring(source)
matches = tree.xpath(value_after_label)
print(matches)
As long as your HTML structure stays exactly as shown, you can get the right value like this:
from bs4 import BeautifulSoup as bfs

# Sample markup from the question: every "aAAD" row pairs a "Bgbcca" label
# <div> with a value wrapped in two nested "hthtb" <span> elements.
html = """<div class="xyOfqd">
<div class="aAAD">
<div class="Bgbcca">Updated</div>
<span class="hthtb">
<div>
<span class="hthtb">September 30, 2018</span>
</div>
</span>
</div>
<div class="aAAD">
<div class="Bgbcca">Text1</div>
<span class="hthtb">
<div><span class="hthtb">Text2</span></div>
</span>
</div>
<div
class="aAAD">
<div class="Bgbcca">MyText</div>
<span class="hthtb">
<div>
<span class="hthtb">Text3</span>
</div>
</span>
</div>
<div class="aAAD">
<div class="Bgbcca">Text4</div>
<span class="hthtb">
<div><span
class="hthtb">Text5</span></div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">Text6</div>
<span class="hthtb">
<div><span
class="hthtb">Text7</span></div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">
Text8/div>
<span class="hthtb">
<div>
<span class="hthtb">
<div>Text9</div>
<div>Text10</div>
</span>
</div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">Text11</div>
<span class="hthtb">
<div><span class="hthtb">Text12</span></div>
</span>
</div>"""

soup = bfs(html, 'html.parser')

# Stays an empty string when no row carries the "MyText" label.
result = ''
for row in soup.find_all('div', {'class': 'aAAD'}):
    labels = row.find_all('div', {'class': 'Bgbcca'})
    if not any(label.get_text() == 'MyText' for label in labels):
        continue
    # The wanted value lives in the inner of the two nested "hthtb" spans.
    outer_span = row.find('span', {'class': 'hthtb'})
    if not outer_span:
        continue
    inner_span = outer_span.find('span', {'class': 'hthtb'})
    if not inner_span:
        continue
    # No early exit: if several rows match, the last one wins.
    result = inner_span.get_text()
print(result)
You can build a custom query function to pass into find():
def has_my_text(tag):
    """Return True when the first '.Bgbcca' element under *tag* reads "MyText".

    Intended to be passed to ``find()`` as a filter function. *tag* only
    needs a ``select_one()`` method whose result offers ``get_text()``.

    Returns:
        bool: True on an exact text match; False when the text differs or
        when *tag* contains no '.Bgbcca' element at all.
    """
    # Keep the lookup result in a variable: select_one() yields None when
    # nothing matches, and calling .get_text() on None would raise.
    found = tag.select_one('.Bgbcca')
    if found is None:
        # Answer with an explicit False instead of falling off the end and
        # returning None.
        return False
    return found.get_text() == "MyText"
soup = bs4.... # placeholder: build your BeautifulSoup object here
found = soup.find(has_my_text)
# `found` is the row that contains the "MyText" label:
# <div class="aAAD">
# <div class="Bgbcca">MyText</div>
# <span class="hthtb">
# <div>
# <span class="hthtb">Text3</span>
# </div>
# </span>
# </div>
# The "hthtb" span is nested inside another "hthtb" span, so go two levels
# deep. Both select_one() calls assume a match exists; a None result would
# raise on the next attribute access.
result = found.select_one('.hthtb').select_one('.hthtb').get_text()
# 'Text3'
# The shorter form below also works as long as the outer span holds nothing
# but whitespace around the inner span's text.
result = found.select_one('.hthtb').get_text().strip()
Note that find() and select_one() assume we only need the first match found. If you need to handle multiple matches, you'll need to use find_all() and select() instead and adjust your code accordingly.
If you want to handle variable texts, you can define your function like this:
def has_my_text(tag, text):
    """Return True when the first '.Bgbcca' element under *tag* reads *text*.

    Args:
        tag: Element to inspect; it only needs a ``select_one()`` method
            whose result offers ``get_text()``.
        text: Exact label text to compare against.

    Returns:
        bool: True on an exact text match; False when the text differs or
        when *tag* contains no '.Bgbcca' element at all.
    """
    # select_one() yields None when nothing matches, so check before
    # calling .get_text() on the result.
    found = tag.select_one('.Bgbcca')
    if found is None:
        # Answer with an explicit False instead of falling off the end and
        # returning None.
        return False
    return found.get_text() == text
And wrap the function in your find() like this:
# Label text to search for; change this to look up a different row.
txt = "MyText"
# find() hands each tag to the callable as its only argument, so the lambda
# supplies `txt` as the second argument of has_my_text.
# NOTE: the match is not stored here; assign the return value to use it.
soup.find(lambda tag: has_my_text(tag, txt))
I am new to Python and need to get the title, ISBN, price and publication date for my very first crawler.
<div class="col-md-7 col-sm-7">
<h4>Pocket Anatomy and Physiology, 3rd Edition</h4>
<div>Shirley A. Jones</div>
<div>ISBN-13: 978-0-8036-5658-1</div>
<p class="price"> $39.95 (US)</p>
<div class="prd_lst">
<ul class="book_list">
</ul>
<div class="mobile_add_tocart">
<button type="button" class="addtocart" onclick="window.location.href='https://shoppingcart.fadavis.com/ShoppingCart/AddToCart?guid=74779e63-ccfb-454e-a6b9-b4e9f9a50793&productid=10959&applicationid=5'"> <span class="cart_icon sprite pull-left"></span>Add to Cart</button>
</div>
<div class="popover bottom Available_tooltip"><div class="arrow"></div>
<div class="popover-content">
<ul class="book_list">
</ul>
<div class="clearfix"></div>
</div>
</div>
<div class="clearfix"></div>
</div>
<p>Publication Date: 10/12/2016</p>
<div class="available active">
<div class="available_icon sprite pull-left"></div>
Available</div>
</div>
import bs4

# Product markup copied from the question.
html = """
<div class="col-md-7 col-sm-7">
<h4>Pocket Anatomy and Physiology, 3rd Edition</h4>
<div>Shirley A. Jones</div>
<div>ISBN-13: 978-0-8036-5658-1</div>
<p class="price"> $39.95 (US)</p>
<div class="prd_lst">
<ul class="book_list">
</ul>
<div class="mobile_add_tocart">
<button type="button" class="addtocart" onclick="window.location.href='https://shoppingcart.fadavis.com/ShoppingCart/AddToCart?guid=74779e63-ccfb-454e-a6b9-b4e9f9a50793&productid=10959&applicationid=5'"> <span class="cart_icon sprite pull-left"></span>Add to Cart</button>
</div>
<div class="popover bottom Available_tooltip"><div class="arrow"></div>
<div class="popover-content">
<ul class="book_list">
</ul>
<div class="clearfix"></div>
</div>
</div>
<div class="clearfix"></div>
</div>
<p>Publication Date: 10/12/2016</p>
<div class="available active">
<div class="available_icon sprite pull-left"></div>
Available</div>
</div>
</div>
"""

soup = bs4.BeautifulSoup(html, 'lxml')

# Every requested field sits inside the product column container.
div = soup.find('div', {'class': 'col-md-7'})

# The title is the container's <h4>. It was requested in the question but
# never read before, so it is extracted and printed first.
title = div.find('h4')

# find_all() is the current name of the legacy findAll() alias.
divs = div.find_all('div')
price = div.find('p', {'class': 'price'})
date = div.find_all('p')

print(title.text)
print(divs[0].text)   # first nested <div>: author
print(divs[1].text)   # second nested <div>: ISBN
print(price.text)
print(date[-1].text)  # last <p> in the container: publication date
Output
Shirley A. Jones
ISBN-13: 978-0-8036-5658-1
$39.95 (US)
Publication Date: 10/12/2016