Why can't Beautiful Soup display all <td> data in the tables? - python

I tried to scrape a Wikipedia page a week ago, but I could not figure out why Beautiful Soup only shows the string for some table columns and shows "None" for others.
NOTE: every table column contains data.
My program extracts all table cells with the class "description". I am trying to extract all the descriptions from the table.
The website I am scraping is: http://en.wikipedia.org/wiki/Supernatural_(season_6)
This is my code:
from BeautifulSoup import BeautifulSoup
import urllib
import sys
from urllib import FancyURLopener

class MyOpener(FancyURLopener):
    version = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.65 Safari/534.24'

def printList(rowList):
    for row in rowList:
        print row
        print '\n'
    return

url = "http://en.wikipedia.org/wiki/Supernatural_(season_6)"
#f = urllib.urlopen(url)
#content = f.read()
#f.close
myopener = MyOpener()
page = myopener.open(url)
content = page.read()
page.close()

soup = BeautifulSoup(''.join(content))
soup.prettify()

movieList = []
rowListTitle = soup.findAll('tr', 'vevent')
print len(rowListTitle)
#printList(rowListTitle)

for row in rowListTitle:
    col = row.next  # explain this?
    if col != 'None':
        col = col.findNext("b")
        movieTitle = col.string
        movieTuple = (movieTitle, '')
        movieList.append(movieTuple)

#printList(movieList)
for row in movieList:
    print row[0]

rowListDescription = soup.findAll('td', 'description')
print len(rowListDescription)

index = 1
while index < len(rowListDescription):
    description = rowListDescription[index]
    print description
    print description.string
    str = description
    print '####################################'
    movieList[index - 1] = (movieList[index - 1][0], description)
    index = index + 1
I did not paste the output as it is really long, but the output is strange: it does manage to capture the information in the <td>, yet when I call .string it gives me empty content.

Do all the description strings come up empty? From the documentation:
For your convenience, if a tag has only one child node, and that child node is a string, the child node is made available as tag.string, as well as tag.contents[0].
In this case, the description cells often have child nodes, e.g. an <a> link to another Wikipedia article. Those count as non-string child nodes, in which case .string for the description node is set to None.
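If you want the full text of each description regardless of child tags, a minimal sketch that fits the question's code is to join the cell's text nodes (or use bs4's get_text if you switch libraries):

# Sketch: pull all text out of each description cell, even when it holds
# child tags such as <a> links (which make .string return None).
for description in soup.findAll('td', 'description'):
    text = ''.join(description.findAll(text=True))  # works with the old BeautifulSoup import above
    # with bs4 this would be: text = description.get_text()
    print text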

Related

Why am I not seeing any results in my output from extracting indeed data using python

I am trying to run this code in IDLE 3.10.6 and I am not seeing any of the data that should be extracted from Indeed. All this data should be in the output when I run it, but it isn't. Below is the code:
#Indeed data
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko"}
    url = "https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}"
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup

def transform(soup):
    divs = soup.find_all("div", class_="jobsearch-SerpJobCard")
    for item in divs:
        title = item.find("a").text.strip()
        company = item.find("span", class_="company").text.strip()
        try:
            salary = item.find("span", class_="salarytext").text.strip()
        finally:
            salary = ""
        summary = item.find("div", {"class": "summary"}).text.strip().replace("\n", "")
        job = {
            "title": title,
            "company": company,
            'salary': salary,
            "summary": summary
        }
        joblist.append(job)

joblist = []
for i in range(0, 40, 10):
    print(f'Getting page, {i}')
    c = extract(10)
    transform(c)

df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')
Here is the output I get
Getting page, 0
Getting page, 10
Getting page, 20
Getting page, 30
Empty DataFrame
Columns: []
Index: []
Why is this going on and what should I do to get that extracted data from Indeed? What I am trying to get is the job title, company, salary, and summary information. Any help would be greatly appreciated.
The URL string includes {page}, but it's not an f-string, so it's not being interpolated, and the URL you are fetching is:
https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}
That returns an error page.
So you should add an f before the opening quote when you set url.
Also, you are calling extract(10) each time, instead of extract(i).
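A minimal sketch of those two fixes applied to the question's code (selectors and the rest of the script unchanged):

def extract(page):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"}
    # f-string, so {page} is actually interpolated into the URL
    url = f"https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}"
    r = requests.get(url, headers=headers)
    return BeautifulSoup(r.content, "html.parser")

joblist = []
for i in range(0, 40, 10):
    print(f'Getting page, {i}')
    c = extract(i)  # pass the loop variable, not a constant
    transform(c)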
This is the correct way of building the url:
url = "https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}".format(page=page)
r = requests.get(url,headers)
Here r.status_code gives an error 403, which means the request is forbidden. The site will block your request from completing. Use the Indeed job search API instead.

BeautifulSoup: adding strings to dictionary

I created a scraper, but I keep struggling with one part: getting the keywords associated with a movie/tv-show title.
I have a df with the following urls
keyword_link_list = ['https://www.imdb.com/title/tt7315526/keywords?ref_=tt_ql_sm',
'https://www.imdb.com/title/tt11723916/keywords?ref_=tt_ql_sm',
'https://www.imdb.com/title/tt7844164/keywords?ref_=tt_ql_sm',
'https://www.imdb.com/title/tt2034855/keywords?ref_=tt_ql_sm',
'https://www.imdb.com/title/tt11215178/keywords?ref_=tt_ql_sm',
'https://www.imdb.com/title/tt10941266/keywords?ref_=tt_ql_sm',
'https://www.imdb.com/title/tt13210836/keywords?ref_=tt_ql_sm',
'https://www.imdb.com/title/tt0913137/keywords?ref_=tt_ql_sm']
df = pd.DataFrame({'keyword_link':keyword_link_list})
print(df)
Then, I'd like to loop through the column keyword_link, get all the keywords, and add them to a dictionary. I managed to get all the keywords, but I cannot manage to add them to a dictionary. It seems like a simple problem, but I'm not seeing what I'm doing wrong (after hours of struggling). Many thanks in advance for your help!
# Import packages
import requests
import re
from bs4 import BeautifulSoup
import bs4 as bs
import pandas as pd

# Loop through column keyword_link and get the keywords for each link
keyword_dicts = []
for index, row in df.iterrows():
    keyword_link = row['keyword_link']
    print(keyword_link)
    headers = {"Accept-Language": "en-US,en;q=0.5"}
    r = requests.get(keyword_link, headers=headers)
    html = r.text
    soup = bs.BeautifulSoup(html, 'html.parser')
    elements = soup.find_all('td', {'class': "soda sodavote"})
    for element in elements:
        for keyword in element.find_all('a'):
            keyword = keyword['href']
            keyword = re.sub(r'\/search/keyword\?keywords=', '', keyword)
            keyword = re.sub(r'\?item=kw\d+', '', keyword)
            print(keyword)

keyword_dict = {}
keyword_dict['keyword'] = keyword
keyword_dicts.append(keyword_dict)
print(keyword_dicts)
Update
After running the definition, I get the following error:
Note: because the expected output is not that clear and could be improved, this example deals with operating on your list only. You can use the output to create a dataframe, lists, ...
What happens?
Your dictionary is defined right after the loop, so you don't store any information and your list just contains [{'keyword': ''}].
How to fix?
Append your dictionary while iterating over the keywords.
Alternative approach:
However, you do not need a dataframe, and it takes only one line to get your keywords:
keywords = [e.a.text for e in soup.select('[data-item-keyword]')]
In the following example I come up with some variations on how and what could be collected:
Collect just the keywords separated by whitespace:
[e.a.text for e in soup.select('[data-item-keyword]')]
Collect same keywords separated by "-" as in the url:
['-'.join(x.split()) for x in keywords]
Collect keywords and votes, which may also be interesting:
[{'keyword':k,'votes':v} for k,v in zip(keywords,votes)]
Example
import requests, time
from bs4 import BeautifulSoup
import pandas as pd

keyword_link_list = ['https://www.imdb.com/title/tt7315526/keywords?ref_=tt_ql_sm',
                     'https://www.imdb.com/title/tt11723916/keywords?ref_=tt_ql_sm',
                     'https://www.imdb.com/title/tt7844164/keywords?ref_=tt_ql_sm',
                     'https://www.imdb.com/title/tt2034855/keywords?ref_=tt_ql_sm',
                     'https://www.imdb.com/title/tt11215178/keywords?ref_=tt_ql_sm',
                     'https://www.imdb.com/title/tt10941266/keywords?ref_=tt_ql_sm',
                     'https://www.imdb.com/title/tt13210836/keywords?ref_=tt_ql_sm',
                     'https://www.imdb.com/title/tt0913137/keywords?ref_=tt_ql_sm'
                     ]

def cook_soup(url):
    # do not harm the website - add some delay
    # time.sleep(2)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup

data = []
for url in keyword_link_list:
    soup = cook_soup(url)
    keywords = [e.a.text for e in soup.select('[data-item-keyword]')]
    votes = [e['data-item-votes'] for e in soup.select('[data-item-votes]')]
    data.append({
        'url': url,
        'keywords': keywords,
    })

print(data)
### pd.DataFrame(data)
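If you also want the votes per keyword, as in the last variation listed above, a sketch of how the loop body could pair them up (same selectors as the example, just zipping the two lists):

for url in keyword_link_list:
    soup = cook_soup(url)
    keywords = [e.a.text for e in soup.select('[data-item-keyword]')]
    votes = [e['data-item-votes'] for e in soup.select('[data-item-votes]')]
    data.append({
        'url': url,
        # one dict per keyword, keeping its vote count alongside it
        'keywords': [{'keyword': k, 'votes': v} for k, v in zip(keywords, votes)],
    })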
The problem with your code is that you're not saving the keywords in the loop. Also, instead of iterating over dataframe rows, create a function that does what you want and apply it to the keyword_link column.
def get_keywords(row):
    headers = {"Accept-Language": "en-US,en;q=0.5"}
    r = requests.get(row, headers=headers)
    # ^^^ replace keyword_link with row here
    html = r.text
    soup = bs.BeautifulSoup(html, 'html.parser')
    elements = soup.find_all('td', {'class': "soda sodavote"})
    keyword_dict = {'keyword': []}
    # ^^^ declare the dict here
    for element in elements:
        for keyword in element.find_all('a'):
            keyword = keyword['href']
            keyword = re.sub(r'\/search/keyword\?keywords=', '', keyword)
            keyword = re.sub(r'\?item=kw\d+', '', keyword)
            if keyword:
                keyword_dict['keyword'].append(keyword)
                # ^^^ move this inside the loop
    return keyword_dict
However, it might be better to store a plain list of keywords, since the 'keyword' key is really doing nothing here.
Anyway, then you can use it as
df['keywords'] = df['keyword_link'].apply(get_keywords)
Now, if you need a list of the keyword dictionaries, you can do
keyword_dicts = df['keywords'].tolist()
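A minimal sketch of that plain-list variant, reusing the question's imports (the helper name and 'keywords' column are just illustrative):

def get_keyword_list(url):
    # Same scraping logic as get_keywords above, but returns a plain list of keywords.
    headers = {"Accept-Language": "en-US,en;q=0.5"}
    soup = bs.BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
    keywords = []
    for element in soup.find_all('td', {'class': "soda sodavote"}):
        for a in element.find_all('a'):
            keyword = re.sub(r'\/search/keyword\?keywords=', '', a['href'])
            keyword = re.sub(r'\?item=kw\d+', '', keyword)
            if keyword:
                keywords.append(keyword)
    return keywords

df['keywords'] = df['keyword_link'].apply(get_keyword_list)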

BeautifulSoup: Get table that doesn't appear within the html?

I would like to obtain a table that appears on a URL: https://www.coronavirus.vic.gov.au/exposure-sites
When right-clicking and inspecting the element, it is evident there is a table element with a class that can be referenced. However, when the page is requested, this table does not appear.
Reproducible example:
import pandas as pd
import requests
from bs4 import BeautifulSoup

header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

link = 'https://www.coronavirus.vic.gov.au/exposure-sites'
r = requests.get(link, headers=header)
soup = BeautifulSoup(r.text, "html5lib")
htmltable = soup.find('table', {'class': "rpl-row rpl-search-results-layout__main rpl-row--gutter"})
# Error appears here because the above doesn't exist even though it should?
print(htmltable)

def tableDataText(table):
    """Parses a html segment started with tag <table> followed
    by multiple <tr> (table rows) and inner <td> (table data) tags.
    It returns a list of rows with inner columns.
    Accepts only one <th> (table header/data) in the first row.
    """
    def rowgetDataText(tr, coltag='td'):  # td (data) or th (header)
        return [td.get_text(strip=True) for td in tr.find_all(coltag)]
    rows = []
    trs = table.find_all('tr')
    headerow = rowgetDataText(trs[0], 'th')
    if headerow:  # if there is a header row include first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs:  # for every table row
        rows.append(rowgetDataText(tr, 'td'))  # data row
    return rows

list_table = tableDataText(htmltable)
df = pd.DataFrame(list_table[1:], columns=list_table[0])
df
The end state should be a table which is a collection of the 18 pages of tables on the webpage.
You can make a call to this URL to get the data in JSON format. It returns a list of dictionaries, and by looping over it the data can be extracted using the key associated with each field.
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.coronavirus.vic.gov.au/sdp-ckan?resource_id=afb52611-6061-4a2b-9110-74c920bede77&limit=10000")
data = res.json()
main_data = data['result']['records']
for i in range(len(main_data)):
    print(main_data[i]['Suburb'])
    print(main_data[i]['Site_title'])
Output:
Newport
TyrePlus Newport
Newport
TyrePlus Newport
Newport
...
How to find the URL: go to Chrome Developer mode and the Network tab, refresh your site, and find the data request from the image (left-hand side); from the preview you will get to know the URL.
Image:
For a DataFrame:
import pandas as pd
df = pd.DataFrame(main_data)
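If you only need a couple of fields, a short sketch (assuming the column names match the record keys printed above) is:

# Keep just the suburb and site title columns from the records DataFrame.
exposure_sites = df[['Suburb', 'Site_title']]
print(exposure_sites.head())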

Python web-scraping using Beautifulsoup: lowes stores

I am new to scraping. I have been asked to get a list of store number, city, and state from the website: https://www.lowes.com/Lowes-Stores
Below is what I have tried so far. Since the structure does not have an attribute, I am not sure how to continue my code. Please guide!
import requests
from bs4 import BeautifulSoup
import json
from pandas import DataFrame as df

url = "https://www.lowes.com/Lowes-Stores"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
page = requests.get(url, headers=headers)
page.encoding = 'ISO-885901'
soup = BeautifulSoup(page.text, 'html.parser')
lowes_list = soup.find_all(class_="list unstyled")
for i in lowes_list[:2]:
    print(i)
example = lowes_list[0]
example_content = example.contents
example_content
You've found the list elements that contain the links that you need for state store lookups in your for loop. You will need to get the href attribute from the "a" tag inside each "li" element.
This is only the first step since you'll need to follow those links to get the store results for each state.
Since you know the structure of this state link result, you can simply do:
for i in lowes_list:
    list_items = i.find_all('li')
    for x in list_items:
        for link in x.find_all('a'):
            print(link['href'])
There are definitely more efficient ways of doing this, but the list is very small and this works.
Once you have the links for each state, you can create another request for each one to visit those store results pages. Then obtain the href attribute from the search result links on each state's page. The href of a store link such as "Anchorage Lowe's" contains the city and the store number.
Here is a full example. I included lots of comments to illustrate the points.
You pretty much had everything up to Line 27, but you needed to follow the links for each state. A good technique for approaching these is to test the path out in your web browser first with the dev tools open, watching the HTML so you have a good idea of where to start with the code.
This script will obtain the data you need, but doesn't provide any data presentation.
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.lowes.com/Lowes-Stores"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
}

page = requests.get(url, headers=headers, timeout=5)
page.encoding = "ISO-885901"
soup = bs(page.text, "html.parser")
lowes_state_lists = soup.find_all(class_="list unstyled")

# we will store the links for each state in this array
state_stores_links = []

# now we populate the state_stores_links array by finding the href in each li tag
for ul in lowes_state_lists:
    list_items = ul.find_all("li")
    # now we have all the list items from the page, we have to extract the href
    for li in list_items:
        for link in li.find_all("a"):
            state_stores_links.append(link["href"])

# This next part is what the original question was missing: following the state links to their respective search result pages.
# at this point we have to request a new page for each state and store the results
# you can use pandas, but a dict works too.
states_stores = {}

for link in state_stores_links:
    # splitting up the link on the / gives us the parts of the URLs.
    # by inspecting with Chrome DevTools, we can see that each state follows the same pattern (state name and state abbreviation)
    link_components = link.split("/")
    state_name = link_components[2]
    state_abbreviation = link_components[3]

    # let's use the state_abbreviation as the dict's key, and we will have a stores array that we can do reporting on
    # the type and shape of this dict is irrelevant at this point. This example illustrates how to obtain the info you're after
    # in the end the states_stores[state_abbreviation]['stores'] array will hold dicts, each with a store_number and a city key
    states_stores[state_abbreviation] = {"state_name": state_name, "stores": []}

    try:
        # simple error catching in case something goes wrong, since we are sending many requests.
        # our link is just the second half of the URL, so we have to craft the new one.
        new_link = "https://www.lowes.com" + link
        state_search_results = requests.get(new_link, headers=headers, timeout=5)
        stores = []
        if state_search_results.status_code == 200:
            store_directory = bs(state_search_results.content, "html.parser")
            store_directory_div = store_directory.find("div", class_="storedirectory")
            # now we get the links inside the storedirectory div
            individual_store_links = store_directory_div.find_all("a")
            # we now have all the stores for this state! Let's parse and save them into our store dict
            # the store's city is after the state's abbreviation followed by a dash, the store number is the last thing in the link
            # example: "/store/AK-Wasilla/2512"
            for store in individual_store_links:
                href = store["href"]
                try:
                    # by splitting the href, which looks to be consistent throughout the site, we can get the info we need
                    split_href = href.split("/")
                    store_number = split_href[3]
                    # the store city is after the -, so we have to split that element up into its two parts and access the second part.
                    store_city = split_href[2].split("-")[1]
                    # creating our store dict
                    store_object = {"city": store_city, "store_number": store_number}
                    # adding the dict to our state's dict
                    states_stores[state_abbreviation]["stores"].append(store_object)
                except Exception as e:
                    print(
                        "Error getting store info from {0}. Exception: {1}".format(
                            split_href, e
                        )
                    )
            # let's print something so we can confirm our script is working
            print(
                "State store count for {0} is: {1}".format(
                    states_stores[state_abbreviation]["state_name"],
                    len(states_stores[state_abbreviation]["stores"]),
                )
            )
        else:
            print(
                "Error fetching: {0}, error code: {1}".format(
                    link, state_search_results.status_code
                )
            )
    except Exception as e:
        print("Error fetching: {0}. Exception: {1}".format(state_abbreviation, e))

How do I retrieve URLs and data from the URLs from a list of weblinks

"Hello, i am quite new to web-scraping. I recently retrieved a list of web-links and there are URLs within these links containing data from tables. I am planning to scrape the data but can't seem to even get the URLs. Any form of help is much appreciated"
"The list of weblinks are
https://aviation-safety.net/database/dblist.php?Year=1919
https://aviation-safety.net/database/dblist.php?Year=1920
https://aviation-safety.net/database/dblist.php?Year=1921
https://aviation-safety.net/database/dblist.php?Year=1922
https://aviation-safety.net/database/dblist.php?Year=2019"
"From the list of links, i am planning to
a. get the URLs within these links
https://aviation-safety.net/database/record.php?id=19190802-0
https://aviation-safety.net/database/record.php?id=19190811-0
https://aviation-safety.net/database/record.php?id=19200223-0"
"b. get data from tables within each URL
(e.g., Incident date, incident time, type, operator, registration, msn, first flight, classification)"
# Get the list of weblinks
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

headers = {'insert user agent'}

# start of code
mainurl = "https://aviation-safety.net/database/"

def getAndParseURL(mainurl):
    result = requests.get(mainurl)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.find_all('a', href=True)
    return datatable

datatable = getAndParseURL(mainurl)

# go through the content and grab the URLs
links = []
for link in datatable:
    if 'Year' in link['href']:
        url = link['href']
        links.append(mainurl + url)

# check if links are in dataframe
df = pd.DataFrame(links, columns=['url'])
df.head(10)

# save the links to a csv
df.to_csv('aviationsafetyyearlinks.csv')

# from the csv, read each web-link and get URLs within each link
import csv
from urllib.request import urlopen

contents = []
df = pd.read_csv('aviationsafetyyearlinks.csv')
urls = df['url']
for url in urls:
    contents.append(url)
for url in contents:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    addtable = soup.find_all('a', href=True)
"I am only able to get the list of web-links and am unable to get the URLs nor the data within these web-links. The code continually shows arrays
not really sure where my code is wrong, appreciate any help and many thanks in advance."
While requesting the page, add a User-Agent header.
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
mainurl = "https://aviation-safety.net/database/dblist.php?Year=1919"

def getAndParseURL(mainurl):
    result = requests.get(mainurl, headers=headers)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.select('a[href*="database/record"]')
    return datatable

print(getAndParseURL(mainurl))
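To go from those anchors to the individual record pages (part a of the question), a minimal sketch, reusing the question's imports, could build absolute URLs with urljoin and request each one; parsing the per-record table (part b) is left open since that page's markup is not shown here:

from urllib.parse import urljoin

record_anchors = getAndParseURL(mainurl)
record_urls = [urljoin("https://aviation-safety.net/", a['href']) for a in record_anchors]

for record_url in record_urls:
    record_page = requests.get(record_url, headers=headers)
    record_soup = BeautifulSoup(record_page.content, 'html.parser')
    # The incident details sit in a table on each record page; inspect one record
    # in the browser to see which rows hold date, time, type, operator, etc.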
