from bs4 import BeautifulSoup
html_content = """<div id="formContents" class="dformDisplay ">
<div class="sectionDiv expanded">
<table id="sect_s1" class="formSection LabelsAbove">
<tr class="formRow ">
<td id="tdl_8" class="label lc" >
<label class="fieldLabel " ><b >Address</b></label>
<table class="EmailFieldPadder" border="0" cellspacing="0" cellpadding="0" valign="top" style="width:98%;margin-top:.3em;margin-right:1.5em;">
<tr><td class="EmailDivWrapper" style="background-color:#f5f5f5;padding: 0.83em;border-radius:3px;margin:0;border:0px;">
<div id="tdf_8" class="cell cc" >
<a
href="https://maps.google.com/?q=1183+Pelham+Wood+Dr%2C+Rock+Hill%2C+SC+29732">1183
Pelham Wood Dr, Rock Hill, SC 29732</a>
</span></div>
</td></tr></table>
</td>
"""
try:
soup = BeautifulSoup(html_content, 'html.parser')
form_data = soup.find("div",{"id":"formContents"})
if form_data:
section_data = soup.findAll("div",{"class":"sectionDiv expanded"})
for datas in section_data:
labels = datas.findAll("label",{"class":"fieldLabel"})
for item in labels:
labels = item.text
print(labels)
entity_data = item.findAll("td").text
print(entity_data)
except Exception as e:
print(e)
My required output:
Address : 1183 Pelham Wood Dr, Rock Hill, SC 29732
Is there a way to get this particular output using BeautifulSoup? I need to extract the address from the HTML source above.
In newer code, avoid the old findAll() syntax; use find_all() or select() with CSS selectors instead. For more, take a minute to check the docs.
You could select every <td> that contains a <label> and use stripped_strings to extract its contents. If the goal is the same as in "How to scrape data from the website which is not aligned properly", this gives you a nicely structured dict of label and text:
dict(e.stripped_strings for e in soup.select('#formContents td:has(label)'))
or this if it is close to the requirements from How to extract the data from the html content:
dict((e.text,e.find_next('td').get_text(strip=True)) for e in soup.select('label'))
Example
from bs4 import BeautifulSoup
html_content = """<div id="formContents" class="dformDisplay ">
<div class="sectionDiv expanded">
<div class="Title expanded ToggleSection shead"
style="margin-top:1em"
id="sect_s11Header">
<div><!--The div around the table is so that the toggling can be animated smoothly-->
<table id="sect_s1" class="formSection LabelsAbove">
<tr class="formRow ">
<td id="tdl_8" class="label lc" >
<label class="fieldLabel " ><b >Address</b></label>
<table class="EmailFieldPadder" border="0" cellspacing="0" cellpadding="0" valign="top" style="width:98%;margin-top:.3em;margin-right:1.5em;">
<tr><td class="EmailDivWrapper" style="background-color:#f5f5f5;padding: 0.83em;border-radius:3px;margin:0;border:0px;">
<div id="tdf_8" class="cell cc" >
<a
href="https://maps.google.com/?q=1183+Pelham+Wood+Dr%2C+Rock+Hill%2C+SC+29732">1183
Pelham Wood Dr, Rock Hill, SC 29732</a>
</span></div>
</td></tr></table>
</td>
"""
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and warns), so the tree can differ between machines.
soup = BeautifulSoup(html_content, 'html.parser')
# Each <td> containing a <label> yields two stripped strings in this sample
# (label text, then value); dict() consumes each pair as key/value and
# raises ValueError if a cell yields any other number of strings.
print(dict(e.stripped_strings for e in soup.select('#formContents td:has(label)')))
Output
{'Address': '1183\nPelham Wood Dr, Rock Hill, SC 29732'}
You can search for a tag where href starts with https://maps.google.com:
>>> soup.find('a', {'href': re.compile('^https://maps.google.com')}).text.replace('\n', ' ')
'1183 Pelham Wood Dr, Rock Hill, SC 29732'
The important thing here is not the soup object used but the strategy with a regexp to extract the address text from the tag.
When I try your code, it prints
Address
ResultSet object has no attribute 'text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
You should take note of the second line, because item.findAll("td").text will always raise error; you can instead do something like '\n'.join([td.text for td in item.findAll("td")]) and that should not raise any error.
However, it will return only an empty string [as item.findAll("td") is an empty ResultSet] because with for item in labels....item.findAll("td")..., you're looking for td tags inside the label tags when they're actually in a table tag next to the label.
Solution 1: Using .find_next_siblings
soup = BeautifulSoup(html_content, 'html.parser')
form_data = soup.find("div", {"id": "formContents"})
if form_data:
    # Search inside the form container only, so similar sections elsewhere
    # in the document are not picked up.
    section_data = form_data.find_all("div", {"class": "sectionDiv expanded"})
    for datas in section_data:
        labels = datas.find_all("label", {"class": "fieldLabel"})
        for item in labels:
            print(item.text)  ## label
            # The value sits in the first <table> sibling after the label;
            # find_next_sibling returns None when there is no such table.
            nxtTable = item.find_next_sibling('table')
            if nxtTable is not None:
                print('\n'.join([td.text for td in nxtTable.find_all("td")]))
[Like this, you shouldn't need the try...except either.]
And [for me] that printed
Address
1183
Pelham Wood Dr, Rock Hill, SC 29732
Solution 2: Using .select with CSS selectors
soup = BeautifulSoup(html_content, 'html.parser')
section_sel = 'div#formContents div.sectionDiv.expanded'
label_sel = 'label.fieldLabel'


def _one_line(tag):
    """Return the tag's text with every run of whitespace collapsed to one space."""
    return ' '.join(tag.get_text(' ').split())


# Only sections that contain a label directly followed by a table with cells.
for datas in soup.select(f'{section_sel}:has({label_sel}+table td)'):
    labels = [
        _one_line(label_tag)
        for label_tag in datas.select(f'{label_sel}:has(+table td)')
    ]
    entity_data = []
    for table in datas.select(f'{label_sel}+table:has(td)'):
        # All cells of one table are flattened into a single line.
        cells = [_one_line(td) for td in table.select('td')]
        entity_data.append(' '.join(cells))
    # data_dict = dict(zip(labels, entity_data))
    for label_text, value in zip(labels, entity_data):
        print(f'{label_text}: {value}')
And that should print
Address: 1183 Pelham Wood Dr, Rock Hill, SC 29732
Btw, dict(zip(labels, entity_data)) would have returned {'Address': '1183 Pelham Wood Dr, Rock Hill, SC 29732'}, and I've used ' '.join(td.get_text(' ').split()) instead of just td.text (and same with l in labels) to minimize whitespace and get everything in one line.
Note: Both solutions become less reliable unless each label is for exactly one table; the second solution assumes that the table is directly adjacent to the label (and will skip any labels without an adjacent table with td tags); and the first solution risks taking a table from the next label if a label is missing a table after it.
Related
I have some HTML that I'm trying to extract specific information for, however it has repeating elements and I have an idea on how to account for this. I'm trying to implement conditional arguments that go as follows:
Extract the player names from the first href tag
search for the next tag named flaggenrahmen and extract the data in alt
If flaggenrahmen repeats again, skip.
Repeat steps.
what I have tried:
player_dict = defaultdict(list)
soup = BeautifulSoup(html)
player_id = soup.select('*[href]')
nation = soup.select('.flaggenrahmen')
for l,k in zip(player_id, nation):
player_dict[l.get_text(strip=True)].append(k['alt'])
However, I cannot get the 'skip' when flaggenrahmen repeats again, and therefore I get more than one country per player.
Produced output:
defaultdict(list,
{'': ['England', 'Spain', 'Portugal'],
'Trent Alexander-Arnold': ['Morocco'],
'Achraf Hakimi': ['England']})
Expected output:
{'Trent Alexander-Arnold':['England'],
'Achraf Hakimi':['Morocco'],
'João Cancelo':['Portugal'],
'Reece James':['England']
}
Here's the html data:
html='''<tbody>
<tr class="odd">
<td class="zentriert">1</td><td class=""><table class="inline-table"><tr><td rowspan="2"><img alt="Trent Alexander-Arnold" class="bilderrahmen-fixed" src="https://img.a.transfermarkt.technology/portrait/small/314353-1559826986.jpg?lm=1" title="Trent Alexander-Arnold"/></td><td class="hauptlink"><a class="spielprofil_tooltip" href="/trent-alexander-arnold/profil/spieler/314353" id="314353" title="Trent Alexander-Arnold">Trent Alexander-Arnold</a></td></tr><tr><td>Right-Back</td></tr></table></td><td class="zentriert">23</td><td class="zentriert"><img alt="England" class="flaggenrahmen" src="https://tmssl.akamaized.net/images/flagge/verysmall/189.png?lm=1520611569" title="England"/></td><td class="zentriert"><a class="vereinprofil_tooltip" href="/fc-liverpool/startseite/verein/31" id="31"><img alt="Liverpool FC" class="" src="https://tmssl.akamaized.net/images/wappen/verysmall/31.png?lm=1456567819" title=" "/></a></td><td class="rechts hauptlink"><b>£67.50m</b><span class="icons_sprite red-arrow-ten" title="£90.00m"> </span></td></tr>
<tr class="even">
<td class="zentriert">2</td><td class=""><table class="inline-table"><tr><td rowspan="2"><img alt="Achraf Hakimi" class="bilderrahmen-fixed" src="https://img.a.transfermarkt.technology/portrait/small/398073-1633679363.jpg?lm=1" title="Achraf Hakimi"/></td><td class="hauptlink"><a class="spielprofil_tooltip" href="/achraf-hakimi/profil/spieler/398073" id="398073" title="Achraf Hakimi">Achraf Hakimi</a></td></tr><tr><td>Right-Back</td></tr></table></td><td class="zentriert">22</td><td class="zentriert"><img alt="Morocco" class="flaggenrahmen" src="https://tmssl.akamaized.net/images/flagge/verysmall/107.png?lm=1520611569" title="Morocco"/><br/><img alt="Spain" class="flaggenrahmen" src="https://tmssl.akamaized.net/images/flagge/verysmall/157.png?lm=1520611569" title="Spain"/></td><td class="zentriert"><a class="vereinprofil_tooltip" href="/fc-paris-saint-germain/startseite/verein/583" id="583"><img alt="Paris Saint-Germain" class="" src="https://tmssl.akamaized.net/images/wappen/verysmall/583.png?lm=1522312728" title=" "/></a></td><td class="rechts hauptlink"><b>£63.00m</b><span class="icons_sprite green-arrow-ten" title="£54.00m"> </span></td></tr>
<tr class="odd">
<td class="zentriert">3</td><td class=""><table class="inline-table"><tr><td rowspan="2"><img alt="João Cancelo" class="bilderrahmen-fixed" src="https://img.a.transfermarkt.technology/portrait/small/182712-1615221629.jpg?lm=1" title="João Cancelo"/></td><td class="hauptlink"><a class="spielprofil_tooltip" href="/joao-cancelo/profil/spieler/182712" id="182712" title="João Cancelo">João Cancelo</a></td></tr><tr><td>Right-Back</td></tr></table></td><td class="zentriert">27</td><td class="zentriert"><img alt="Portugal" class="flaggenrahmen" src="https://tmssl.akamaized.net/images/flagge/verysmall/136.png?lm=1520611569" title="Portugal"/></td><td class="zentriert"><a class="vereinprofil_tooltip" href="/manchester-city/startseite/verein/281" id="281"><img alt="Manchester City" class="" src="https://tmssl.akamaized.net/images/wappen/verysmall/281.png?lm=1467356331" title=" "/></a></td><td class="rechts hauptlink"><b>£49.50m</b><span class="icons_sprite green-arrow-ten" title="£45.00m"> </span></td></tr>
<tr class="even">
<td class="zentriert">4</td><td class=""><table class="inline-table"><tr><td rowspan="2"><img alt="Reece James" class="bilderrahmen-fixed" src="https://img.a.transfermarkt.technology/portrait/small/472423-1569484519.png?lm=1" title="Reece James"/></td><td class="hauptlink"><a class="spielprofil_tooltip" href="/reece-james/profil/spieler/472423" id="472423" title="Reece James">Reece James</a></td></tr><tr><td>Right-Back</td></tr></table></td><td class="zentriert">21</td><td class="zentriert"><img alt="England" class="flaggenrahmen" src="https://tmssl.akamaized.net/images/flagge/verysmall/189.png?lm=1520611569" title="England"/></td><td class="zentriert"><a class="vereinprofil_tooltip" href="/fc-chelsea/startseite/verein/631" id="631"><img alt="Chelsea FC" class="" src="https://tmssl.akamaized.net/images/wappen/verysmall/631.png?lm=1628160548" title=" "/></a></td><td class="rechts hauptlink"><b>£40.50m</b><span class="icons_sprite green-arrow-ten" title="£36.00m"> </span></td></tr>
<tr class="odd">
<tbody>'''.replace('< ', '<')
this should do
players = {}
soup = BeautifulSoup(html, 'lxml')
# Walk only the direct <tr> children of the first <tbody>; rows of the nested
# inline tables are descendants, not children, so they are not visited.
for row in soup.tbody.find_all('tr', recursive=False):
    name_link = row.select_one('.spielprofil_tooltip')
    # select_one returns the first match, so any further flags in the same
    # row are ignored.
    first_flag = row.select_one('.flaggenrahmen')
    if not (name_link and first_flag):
        continue
    players[name_link.text] = [first_flag['title']]
print(players)
>>> {'Trent Alexander-Arnold': ['England'], 'Achraf Hakimi': ['Morocco'], 'João Cancelo': ['Portugal'], 'Reece James': ['England']}
Using Beautiful Soup v4, I've some td elements, some of which contain a child a element.
<tr class="">
<td class="tblimg"><img alt="" src="/blah/deficon.png"/></td>
<td><b>file.mp3</b><br/><span
style="color: grey;">76.33 MB<br/>33129 Downloads<br/>55:34 Mins<br/>192kbps Stereo</span>
</td>
</tr>
Is there a good way to find only those td that have a child a? Currently, I'm iterating over all of them and discarding the ones for which td.find("a") doesn't exist.
Although you already have the answer, I would like to provide another solution for your reference:)
from simplified_scrapy import SimplifiedDoc
html = '''<table><tr class="">
<td class="tblimg"><img alt="" src="/blah/deficon.png"/></td>
<td><b>file.mp3</b><br/><span
style="color: grey;">76.33 MB<br/>33129 Downloads<br/>55:34 Mins<br/>192kbps Stereo</span>
</td>
</tr></table>
'''
doc = SimplifiedDoc(html) # create doc
# First get all a in the table, and then take the parent of a. All the data can be retrieved at one time.
tds = doc.selects('table>a').parent
print (tds)
Result:
[{'tag': 'td', 'html': '<b>file.mp3</b><br /><span style="color: grey;">76.33 MB<br />33129 Downloads<br />55:34 Mins<br />192kbps Stereo</span>\n '}]
I am trying to extract some simple fields from an HTML page. It is a table with some repetitive data.
Every record has a FIRST_NAME (and a bunch of other stuff) but not every record has a WEBSITE. So my xpath solution was returning 10 names but only 9 website urls.
# XPath attribute tests are written with '@'; each query returns a flat list,
# so names and urls cannot be paired up when a record lacks a url.
fname = tree.xpath('//span[@class="given-name"]/text()')
fweb = tree.xpath('//a[@class="url"]/text()')
Using that method I can't tell which record is missing the url.
So now I want to divide the file into chunks; each chunk would start with the span class GIVEN-NAME and end right before the next GIVEN-NAME.
How do I do that? In my code, I have an infinite loop that keeps returning the first instance of span class FIRST-NAME, it doesn't progress through the HTML file.
with open('sample A.htm') as f:
soup = bs4.BeautifulSoup(f.read())
many_names= soup.find_all('span',class_='given-name')
print len(many_names)
for i in range(len(many_names)):
first_name = soup.find('span', class_='given-name').text
website = soup.find('a', class_='url').text
myprint (i, first_name, last_name, aco, city, qm, website)
soup.find_next('span', class_='given-name')
The last statement (find_next) doesn't seem to do anything.
With or without it, it's just loop that reads from the beginning over and over again. What is the right way to do this?
EDIT: sample from HTML file (I edited some out because there is a lot more)
Physically, the layout is span given-name blah blah blah URL buried in there somewhere, then another span given-name
enter code here
</div>
<div class="connections-list cn-list-body cn-clear" id="cn-list-body">
<div class="cn-list-section-head" id="cn-char-A"></div><div class="cn-list-row-alternate vcard individual art-literary-agents celebrity-nonfiction-literary-agents chick-lit-fiction-literary-agents commercial-fiction-literary-agents fiction-literary-agents film-entertainment-literary-agents history-nonfiction-literary-agents literary-fiction-literary-agents military-war-literary-agents multicultural-nonfiction-literary-agents multicultural-fiction-literary-agents music-literary-agents new-york-literary-agents-ny nonfiction-literary-agents photography-literary-agents pop-culture-literary-agents religion-nonfiction-literary-agents short-story-collection-literary-agents spirituality-literary-agents sports-nonfiction-literary-agents usa-literary-agents womens-issues-literary-agents" id="richard-abate" data-entry-type="individual" data-entry-id="19337" data-entry-slug="richard-abate"><div id="entry-id-193375501ffd6551a6" class="cn-entry">
<table border="0px" bordercolor="#E3E3E3" cellspacing="0px" cellpadding="0px">
<tr>
<td align="left" width="55%" valign="top">
<span class="cn-image-style"><span style="display: block; max-width: 100%; width: 125px"><img height="125" width="125" sizes="100vw" class="cn-image logo" alt="Logo for Richard Abate" title="Logo for Richard Abate" srcset="http://literaryagencies.com/wp-content/uploads/connections-images/richard-abate/richard-abate-literary-agent_logo_1-7bbdb1a0dbafe8417e994150608c55e4.jpg 1x" /></span></span>
</td>
<td align="right" valign="top" style="text-align: right;">
<div style="clear:both; margin: 5px 5px;">
<div style="margin-bottom: 5px;">
<span class="fn n"> <span class="given-name">Richard</span> <span class="family-name">Abate</span> </span>
<span class="title">3 Arts Entertainment</span>
<span class="org"><span class="organization-unit">Query method(s): Postal Mail *</span></span>
</div>
<span class="address-block">
<span class="adr"><span class="address-name">Work</span> <span class="street-address">16 West 22th St</span> <span class="locality">New York</span> <span class="region">NY</span> <span class="postal-code">10010</span> <span class="country-name">USA</span><span class="type" style="display: none;">work</span></span>
</span>
</div>
</td>
</tr>
<tr>
<td valign="bottom" style="text-align: left;">
<a class="cn-note-anchor toggle-div" id="note-anchor-193375501ffd6551a6" href="#" data-uuid="193375501ffd6551a6" data-div-id="note-block-193375501ffd6551a6" data-str-show="Show Notes" data-str-hide="Close Notes">Show Notes</a> | <a class="cn-bio-anchor toggle-div" id="bio-anchor-193375501ffd6551a6" href="#" data-uuid="193375501ffd6551a6" data-div-id="bio-block-193375501ffd6551a6" data-str-show="Show Bio" data-str-hide="Close Bio">Show Bio</a>
</td>
<td align="right" valign="bottom" style="text-align: right;">
<a class="url" href="http://www.3arts.com" target="new" rel="nofollow">http://www.3arts.com</a>
<span class="cn-image-style"><span style="display: block; max-width: 100%; width: 125px"><img height="125" width="125" sizes="100vw" class="cn-image logo" alt="Logo for Andree Abecassis" title="Logo for Andree Abecassis" srcset="http://literaryagencies.com/wp-content/uploads/connections-images/andree-abecassis/andree-abecassis-literary-agent_logo_1-b531cbac02864497b301e74bc6b37aa9.jpg 1x" /></span></span>
</td>
<td align="right" valign="top" style="text-align: right;">
<div style="clear:both; margin: 5px 5px;">
<div style="margin-bottom: 5px;">
<span class="fn n"> <span class="given-name">Andree</span> <span class="family-name">Abecassis</span> </span>
enter code here
I'm pretty sure it's not the case, assuming you've properly copied and pasted your code, that the last statement gives you a SyntaxError as you say; rather, it will give you an AttributeError, because you've misspelled the method name findNext, calling it find_next instead for some mysterious reason. In general, copy and paste your traceback rather than trying to "paraphrase" it.
However, since you already have a list of all the spans with the relevant class, simplest is to change your second loop to search within each of them:
for index, name_span in enumerate(many_names):
    first_name = name_span.text
    # Look for the url link inside the current span; find() gives None
    # when there is no match.
    link = name_span.find('a', class_='url')
    website = '*MISSING*' if link is None else link.text
    # Placeholders: these four values are never extracted from the page.
    last_name = aco = city = qm = 'YOU NEVER EXTRACT THESE!!!'
    myprint(index, first_name, last_name, aco, city, qm, website)
assuming you have indeed defined a function myprint with all of these parameters.
You'll note I've set four variables to remind you that you never extract these values -- I suspect you'll want to fix that, right?-)
EDIT: as it now appears the relation between the tags being sought is not in the HTML's structure, but a fragile dependence on the mere sequence of the tags' occurrence in the HTML text, a very different approach is required. Here's a possibility:
from bs4 import BeautifulSoup
with open('ha.txt') as f:
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers are installed.
    soup = BeautifulSoup(f, 'html.parser')
def tag_of_interest(t):
    """Return True for <a class="url"> and <span class="given-name"> tags.

    Intended as a filter for ``soup.find_all``.  A tag matches when the
    wanted class is among its classes, so elements carrying additional
    classes (e.g. ``class="url external"``) are still recognised.
    """
    wanted = {'a': 'url', 'span': 'given-name'}.get(t.name)
    if wanted is None:
        return False
    classes = t.attrs.get('class') or []
    # The class attribute is normally a list; tolerate a plain string too.
    if isinstance(classes, str):
        classes = classes.split()
    return wanted in classes
for t in soup.find_all(tag_of_interest):
print(t)
E.g, when I save in ha.txt the HTML snippet now given in the Q after an edit, this script emits:
<span class="given-name">Richard</span>
<a class="url" href="http://www.3arts.com" rel="nofollow" target="new">http://www.3arts.com</a>
<span class="given-name">Andree</span>
So what now remains is to appropriately group the relevant sequence of tags (which I think will also include others, such as the spans with class last-name &c). A class seems appropriate (and functionality such as myprint could neatly be recast as methods of the class, but I'll skip that part).
class Entity(object):
    """One record assembled from a run of consecutive tags of interest."""

    def __init__(self):
        # Every field starts as None so that missing data can be detected
        # once all tags have been processed.
        self.first_name = self.last_name = self.website = None  # &c
# Group the flat, document-ordered sequence of tags into Entity records:
# a given-name span opens a new record, every other tag of interest is
# attached to the most recently opened one.
entities = []
for t in soup.find_all(tag_of_interest):
    # `class` is a Python keyword, so the attribute must be read with .get().
    classes = t.get('class') or []
    if t.name == 'span' and 'given-name' in classes:
        ent = Entity()
        ent.first_name = t.text
        entities.append(ent)
        continue
    if not entities:
        # A tag seen before any given-name span has no record to belong to.
        print('tag %s out of context' % (t,))
        continue
    ent = entities[-1]
    if t.name == 'a' and 'url' in classes:
        ent.website = t.text
    # etc for other tags of interest
In the end, the entities list can be examined for entities missing mandatory bits of data, and so forth.
I have the following HTML code:
<td class="image">
<a href="/target/tt0111161/" title="Target Text 1">
<img alt="target img" height="74" src="img src url" title="image title" width="54"/>
</a>
</td>
<td class="title">
<span class="wlb_wrapper" data-caller-name="search" data-size="small" data-tconst="tt0111161">
</span>
<a href="/target/tt0111161/">
Other Text
</a>
<span class="year_type">
(2013)
</span>
I am trying to use beautiful soup to parse certain elements into a tab-delimited file.
I got some great help and have:
for td in soup.select('td.title'):
span = td.select('span.wlb_wrapper')
if span:
print span[0].get('data-tconst') # To get `tt0082971`
Now I want to get "Target Text 1" .
I've tried some things like the above text such as:
for td in soup.select('td.image'): #trying to select the <td class="image"> tag
img = td.select('a.title') #from inside td I now try to look inside the a tag that also has the word title
if img:
print img[2].get('title') #if it finds anything, then I want to return the text in class 'title'
If you're trying to get a different td based on its class (i.e. td class="image" and td class="title"), you can index a Beautiful Soup tag like a dictionary to read its class attribute.
This will find all the td class="image" in the table.
from bs4 import BeautifulSoup
page = """
<table>
<tr>
<td class="image">
<a href="/target/tt0111161/" title="Target Text 1">
<img alt="target img" height="74" src="img src url" title="image title" width="54"/>
</a>
</td>
<td class="title">
<span class="wlb_wrapper" data-caller-name="search" data-size="small" data-tconst="tt0111161">
</span>
<a href="/target/tt0111161/">
Other Text
</a>
<span class="year_type">
(2013)
</span>
</td>
</tr>
</table>
"""
soup = BeautifulSoup(page, 'html.parser')
tbl = soup.find('table')
rows = tbl.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    for col in cols:
        # The class attribute is a list of names; test membership so a cell
        # is recognised wherever the wanted class sits in that list.
        classes = col.get('class') or []
        if 'image' in classes:
            # The title attribute of each link in the image cell.
            for href in col.find_all('a'):
                print(href.get('title'))
        elif 'title' in classes:
            # The data-tconst attribute of each wlb_wrapper span.
            for span in col.find_all('span'):
                if 'wlb_wrapper' in (span.get('class') or []):
                    print(span.get('data-tconst'))
span.wlb_wrapper is a selector used to select <span class="wlb_wrapper" data-caller-name="search" data-size="small" data-tconst="tt0111161">. Refer this & this for more information on selectors
change this in your python code span = td.select('span.wlb_wrapper') to span = td.select('span') & also span = td.select('span.year_type') and see what it returns.
If you try above and analyze what span holds you will get what you want.
I am learning and trying both Python (2.7) and Beautiful Soup (3.2.0). I already got some help here with my first problems (Beautiful Soup throws `IndexError`)
This is the Python code so far:
# Import the classes that are needed
import urllib2
from BeautifulSoup import BeautifulSoup
# URL to scrape and open it with the urllib2
url = 'http://www.wiziwig.tv/competition.php?competitionid=92&part=sports&discipline=football'
source = urllib2.urlopen(url)
# Turn the saced source into a BeautifulSoup object
soup = BeautifulSoup(source)
# From the source HTML page, search and store all <div class="date">...</div> and it's content
datesDiv = soup.findAll('div', { "class" : "date" })
# Loop through the tag and store only the needed information, being the actual date
dates = [tag.contents[0] for tag in datesDiv]
# From the source HTML page, search and store all <span class="time">...</span> and it's content
timesSpan = soup.findAll('span', { "class" : "time" })
# Loop through the tag and store only the needed information, being the actual times
times = [tag.contents[0] for tag in timesSpan]
# From the source HTML page, search and store all <td class="home">..</td> and it's content
hometeamsTd = soup.findAll('td', { "class" : "home" })
# Loop through the tag and store only the needed information, being the home team
# if tag.contents[1] != 'Dutch KNVB Beker' - Do a direct test if output is needed or not
hometeams = [tag.contents[1] for tag in hometeamsTd if tag.contents[1] != 'Dutch KNVB Beker']
# From the source HTML page, search and store all <td class="away">..</td> and it's content
# [1:] at the end meand slice the first one found
awayteamsTd = soup.findAll('td', { "class" : "away" })[1:]
# Loop through the tag and store only the needed information, being the away team
awayteams = [tag.contents[1] for tag in awayteamsTd]
# From the source HTML page, search and store all <a class="broadcast" href="...">..</a> and it's content
broadcastsA = soup.findAll('a', { "class" : "broadcast" })
# Loop through the tag and store only the needed information, being the the broadcast URL, where we can find the streams
broadcasts = [tag['href'] for tag in broadcastsA]
The problem I have is that the arrays do not all have the same length:
len(dates) #9, should be 6
len(times) #18, should be 12
len(hometeams) #6, is correct
len(awayteams) #6, is correct
len(broadcasts) #9, should be 6
Problem I have is that I do the following search for getting the dates array: soup.findAll('div', { "class" : "date" }). Which obviously gives me all the <div> elements with class="date". But the problem is, that I only need the date when there is also a <td> element with class="away".
See next part of the HTML that I am scraping:
<tr class="odd">
<td class="logo">
<img src="/gfx/disciplines/football.gif" alt="football"/>
</td>
<td>
Dutch Cup
<img src="/gfx/favourite_off.gif" class="fav off" alt="fav icon" id="comp-92"/>
</td>
<td>
<div class="date" rel="1380054900">Tuesday, September 24</div> <!-- This date is not needed, because within this <tr> there is no <td class="away"> -->
<span class="time" rel="1380054900">22:35</span> - <!-- This time is not needed, because within this <tr> there is no <td class="away"> -->
<span class="time" rel="1380058500">23:35</span> <!-- This time is not needed, because within this <tr> there is no <td class="away"> -->
</td>
<td class="home" colspan="3">
<img class="flag" src="/gfx/flags/nl.gif" alt="nl"/>Dutch KNVB Beker<img src="/gfx/favourite_off.gif" alt="fav icon" class="fav off" id="team-6758"/>
</td>
<td class="broadcast">
<a class="broadcast" href="/broadcast.php?matchid=221554&part=sports">Live</a> <!-- This href is not needed, because within this <tr> there is no <td class="away"> -->
</td>
</tr>
<tr class="even">
<td class="logo">
<img src="/gfx/disciplines/football.gif" alt="football"/>
</td>
<td>
Dutch Cup
<img src="/gfx/favourite_off.gif" class="fav off" alt="fav icon" id="comp-92"/>
</td>
<td>
<div class="date" rel="1380127500">Wednesday, September 25</div> <!-- This date we would like to have, because now all records are complete, there is a <td class="away"> in this <tr> -->
<span class="time" rel="1380127500">18:45</span> - <!-- This time we would like to have, because now all records are complete, there is a <td class="away"> in this <tr> -->
<span class="time" rel="1380134700">20:45</span> <!-- This date we would like to have, because now all records are complete, there is a <td class="away"> in this <tr> -->
</td>
<td class="home">
<img class="flag" src="/gfx/flags/nl.gif" alt="nl"/>PSV<img src="/gfx/favourite_off.gif" alt="fav icon" class="fav off" id="team-3"/>
</td>
<td>vs.</td>
<td class="away">
<img src="/gfx/favourite_off.gif" class="fav off" alt="fav icon" id="team-428"/>Stormvogels Telstar<img class="flag" src="/gfx/flags/nl.gif" alt="nl"/>
</td>
<td class="broadcast">
<a class="broadcast" href="/broadcast.php?matchid=221555&part=sports">Live</a> <!-- This href we would like to have, because now all records are complete, there is a <td class="away"> in this <tr> -->
</td>
</tr>
How about rethinking the way you scrape the data. You have a table with matches - then just iterate over the rows:
# One table row per match: read every field from the same <tr> so the values
# always belong together.  findAll is kept because the question targets
# BeautifulSoup 3.
for tr in soup.findAll('tr', {'class': ['odd', 'even']}):
    home_td = tr.find('td', {'class': 'home'})
    away_td = tr.find('td', {'class': 'away'})
    # Rows without both a home and an away cell are not matches; skipping
    # them also avoids calling .text on None.
    if home_td is None or away_td is None:
        continue
    home_team = home_td.text
    if home_team == 'Dutch KNVB Beker':
        continue
    away_team = away_td.text
    # Start and end time of the broadcast, joined as "start - end".
    times = ' - '.join([span.text for span in tr.findAll('span', {'class': 'time'})])
    broadcast = tr.find('a', {'class': 'broadcast'})['href']
    # Single formatted argument: prints the same line on Python 2 and 3.
    print('%s %s %s %s' % (home_team, away_team, times, broadcast))
prints 5 rows:
RKC Waalwijk Heracles 20:45 - 22:45 /broadcast.php?matchid=221553&part=sports
PSV Stormvogels Telstar 18:45 - 20:45 /broadcast.php?matchid=221555&part=sports
Ajax FC Volendam 20:45 - 22:45 /broadcast.php?matchid=221556&part=sports
SC Heerenveen FC Twente 18:45 - 20:45 /broadcast.php?matchid=221558&part=sports
Feyenoord FC Dordrecht 20:45 - 22:45 /broadcast.php?matchid=221559&part=sports
Then, you can collect results into the list of dicts.