I have been messing around with Python for the past few days and, while following Edmund Martin's tutorial, I ran into a problem:
I would like to append the name and title that I scraped to a CSV file.
The only problem is that the data I scraped does not appear in the file.
Could you explain the logic of why only "rank", "description" and "title" are being written to the CSV file and not the actual data? Also, how can I solve that?
Below is the code I found on the tutorial website, with the last three lines that I added:
import requests
from bs4 import BeautifulSoup
import time
import csv

USER_AGENT = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 '
                            'Safari/537.36'}

def fetch_results(search_term, number_results, language_code):
    assert isinstance(search_term, str), 'Search term must be a string'
    assert isinstance(number_results, int), 'Number of results must be an integer'
    escaped_search_term = search_term.replace(' ', '+')
    google_url = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
        escaped_search_term, number_results, language_code)
    response = requests.get(google_url, headers=USER_AGENT)
    response.raise_for_status()
    return search_term, response.text

def parse_results(html, keyword):
    soup = BeautifulSoup(html, 'html.parser')
    found_results = []
    rank = 1
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3', attrs={'class': 'r'})
        description = result.find('span', attrs={'class': 'st'})
        if link and title:
            link = link['href']
            title = title.get_text()
            description = description.get_text()
            if link != '#':
                found_results.append({
                    'rank': rank,
                    'title': title,
                    'description': description
                })
                rank += 1
    return found_results

def scrape_google(search_term, number_results, language_code):
    try:
        keyword, html = fetch_results(search_term, number_results, language_code)
        results = parse_results(html, keyword)
        return results
    except AssertionError:
        raise Exception("Incorrect arguments parsed to function")
    except requests.HTTPError:
        raise Exception("You appear to have been blocked by Google")
    except requests.RequestException:
        raise Exception("Appears to be an issue with your connection")

if __name__ == '__main__':
    keywords = ['python']
    data = []
    for keyword in keywords:
        try:
            results = scrape_google(keyword, 2, "en")
            for result in results:
                data.append(result)
        except Exception as e:
            print(e)
        finally:
            time.sleep(1)
    print(data)

    with open('python_scrape.csv', 'w') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerows(data)
    csvFile.close()
Thanks for the help!
Because you're using csv.writer.writerows (which ends in 's'; rows is plural) rather than writerow, the csv writer expects a list of "iterable objects", which it will treat as rows.
Your __main__ block uses scrape_google() to build a list of dictionaries, each of which looks like {'rank': rank, 'title': title, 'description': description}.
Python iterates through a dictionary by returning each key, so what writerows sees in each row is just the keys "rank", "title", and "description".
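You can see that behaviour in isolation with a small, self-contained example (not part of the original code):

import csv
import io

rows = [{'rank': 1, 'title': 'Example', 'description': 'demo'}]

buf = io.StringIO()
csv.writer(buf).writerows(rows)   # each dict is iterated, which yields only its keys
print(buf.getvalue())             # rank,title,description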
The fastest way to fix what is happening is to add the line
data = [[j[i] for i in j] for j in data]
just before your "with open('python_scrape.csv'..." line (note that the list you actually write is data, so that is the one to convert, not results). This uses a list comprehension, which is a good thing to learn about as a new Python user.
A better way to fix your code would be to make sure that it is building up a list of lists to be written to the csv instead of a list of dictionaries.
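For example, a minimal sketch of that approach, applied to the data list built in the question's __main__ block (the field order here is my own assumption):

import csv

fieldnames = ['rank', 'title', 'description']

with open('python_scrape.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(fieldnames)                                       # header row
    writer.writerows([[row[f] for f in fieldnames] for row in data])  # one list per result

Alternatively, csv.DictWriter(csvFile, fieldnames=fieldnames) accepts the dictionaries directly, so no conversion to lists is needed.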
def parse_results(html, keyword):
    # code ....
    for result in result_block:
        link = result.find('a', href=True)  # here you get the links
        title = result.find('h3', attrs={'class': 'r'})  # here you get the title
        description = result.find('span', attrs={'class': 'st'})  # here you get the description

        # if you want to search for something else here,
        # you can, for example, print(result) and see what data the result variable holds,
        # then parse that data and save it in a variable, for example:
        # body = result.find('h1', attrs={'class': 'h1'})

        if link and title:
            link = link['href']
            title = title.get_text()
            description = description.get_text()
            # here we take the text from that body
            # body = body.get_text()
            if link != '#':
                found_results.append({
                    'rank': rank,
                    'title': title,
                    'description': description,
                    # and here we append it to the list
                    'body': body
                })
                rank += 1
    return found_results
Related
I am iterating through a list of URLs to extract 5 items from each URL using BeautifulSoup.find. The list contains about 2000 URLs. Because not every webpage is guaranteed to have all 5 items, I used try and except as appropriate.
After completing the loop, I noticed 3 things:
The very first 5-10 links would run seamlessly, meaning I would successfully retrieve all 5 items (none of the except blocks were used).
For the overwhelming majority of URLs, the try blocks did not execute, so the except block ran for each item.
Every once in a while, a URL's try blocks DID execute and I would successfully retrieve all 5 items.
I placed the results in a list of dictionaries, and then created a dataframe.
cleanserlist = []

for link in productlinks:
    try:
        r = requests.get(link, headers=headers, timeout=3.05)
    except requests.exceptions.Timeout:
        print("Timeout occurred")
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        price = soup.find('span', class_="sellingPrice").text.strip()
    except:
        price = 'no price'
    try:
        name = soup.find('h1', class_='flex flex-xs-100').text.strip()
    except:
        name = 'no name'
    try:
        ingredients = soup.find('div', class_='v-pane-content').text.strip()
    except:
        ingredients = 'no ingredients'
    try:
        rating = soup.find('div', class_='ratingValue').text.strip()
    except:
        rating = 'no rating'
    try:
        reviews = soup.find('span', class_='reviewCount').text.strip()
    except:
        reviews = 'no reviews'

    cleanser = {
        'name': name,
        'price': price,
        'rating': rating,
        'reviews': reviews,
        'ingredients': ingredients
    }
    cleanserlist.append(cleanser)
    sleep(randint(1, 3))
image of first 44 rows of dataframe
image of subsequent 44 rows of dataframe
A "table driven" approach is highly appropriate for this kind of thing and makes for easier extensibility.
Given that there are a large number of URLs to [try to] access, a multithreaded approach is highly desirable for potentially much-improved performance.
Here's an example of that kind of approach:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

CONTROL = [('price', 'span', 'sellingPrice'),
           ('name', 'h1', 'flex flex-xs-100'),
           ('ingredients', 'div', 'v-pane-content'),
           ('rating', 'div', 'ratingValue'),
           ('reviews', 'span', 'reviewCount')
           ]

AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
TIMEOUT = 3.05

cleanserlist = []
productlinks = []  # list of URLs
headers = {'User-Agent': AGENT}  # potentially more complex

def process(link):
    try:
        (r := requests.get(link, headers=headers, timeout=TIMEOUT)).raise_for_status()
        cleanser = {}
        soup = BeautifulSoup(r.text, 'lxml')
        for v, e, c in CONTROL:
            try:
                cleanser[v] = soup.find(e, class_=c).text.strip()
            except Exception:
                cleanser[v] = f'no {v}'
        cleanserlist.append(cleanser)
    except Exception as e:
        print(f'Error processing {link} due to {e}')

def main():
    with ThreadPoolExecutor() as executor:
        executor.map(process, productlinks)
    print(cleanserlist)

if __name__ == '__main__':
    main()
I want to scrape data from a website but I'm getting an error. As I'm new to web scraping, please guide me on how to fix this issue. Here is the error I am facing: UnboundLocalError: local variable 'soup' referenced before assignment
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html , 2. parser
    return soup

def get_detail_page(soup):
    try:
        title = (soup.find('h1', class_="ProductTitle-title", id=False).text)
    except:
        title = 'Empty Title'
    try:
        title = (soup.find('h1', class_="TopBar-perUnit TopBar-perUnitTop", id=False).text)
    except:
        price = 'Empty price'
    try:
        img = (soup.find('img', class_="ViewSelectorItem-image", id=False).get('src'))
    except:
        img = 'Empty img'
    data = {
        'Title': title,
        'Price': price,
        'Img': img
    }
    print(data)

def main():
    url = "https://www.zazzle.com/60th_silver_diamond_anniversary_photo_invitations-161837951427094549"
    get_detail_page(get_page(url))

if __name__ == '__main__':
    main()
I've added the user-agent to your code:
import urllib.request as urllib2
from bs4 import BeautifulSoup
import csv

REQUEST_HEADER = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

def get_page(url):
    req = urllib2.Request(url, headers=REQUEST_HEADER)
    page = urllib2.urlopen(req, timeout=20).read()
    soup = BeautifulSoup(page, "html.parser")
    return soup

def get_detail_page(soup):
    try:
        title = (soup.find('h1', class_="ProductTitle-title", id=False).text)
    except:
        title = 'Empty Title'
    try:
        title = (soup.find('h1', class_="TopBar-perUnit TopBar-perUnitTop", id=False).text)
    except:
        price = 'Empty price'
    try:
        img = (soup.find('img', class_="ViewSelectorItem-image", id=False).get('src'))
    except:
        img = 'Empty img'
    data = {
        'Title': title,
        'Price': price,
        'Img': img
    }
    print(data)

def main():
    url = "https://www.zazzle.com/60th_silver_diamond_anniversary_photo_invitations-161837951427094549"
    get_detail_page(get_page(url))

if __name__ == '__main__':
    main()
Also, a pretty interesting read: Google Chrome: Change the User-Agent String
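As a side note on the UnboundLocalError itself: in the original get_page, soup is only assigned in the else branch, so when the status check fails the final return soup refers to a name that was never bound. A minimal sketch of a version that avoids this (my own illustration, not part of the original answer; the placeholder User-Agent stands in for a full string like the one above):

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}   # placeholder; use a full UA string in practice

def get_page(url):
    response = requests.get(url, headers=HEADERS, timeout=20)
    if not response.ok:
        print('server responded:', response.status_code)
        return None                       # explicit, so the caller can check for failure
    return BeautifulSoup(response.text, 'html.parser')

# caller side:
# soup = get_page(url)
# if soup is not None:
#     get_detail_page(soup)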
I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
The problem is that it continues looping. Once it has parsed all reviews, it goes back to the most recent ones and starts over - therefore, it doesn't produce the .csv with all reviews.
The funniest part is that I am only encountering issues with some venues (for example) and not others (such as this one)
I'm attaching the code here in case someone can help me.
--
Hereby the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io

def display(content, filename='output.html'):
    with open(filename, 'wb') as f:
        f.write(content)
    webbrowser.open(filename)

def get_soup(session, url, show=False):
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[get_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''
    r = session.post(url, data=params)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[post_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def scrape(url, lang='ALL'):
    # create session to keep all cookies (etc.) between requests
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })
    items = parse(session, url + '?filterLang=' + lang)
    return items

def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''
    print('[parse] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse] no soup:', url)
        return
    num_reviews = soup.find('span', class_='reviews_header_count').text  # get text
    num_reviews = num_reviews[1:-1]
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews)  # convert text into integer
    print('[parse] num_reviews ALL:', num_reviews)
    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)
    items = []
    offset = 0
    while(True):
        subpage_url = url_template.format(offset)
        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break
        items += subpage_items
        if len(subpage_items) < 5:
            break
        offset += 5
    return items

def get_reviews_ids(soup):
    items = soup.find_all('div', attrs={'data-reviewid': True})
    if items:
        reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
        print('[get_reviews_ids] data-reviewid:', reviews_ids)
        return reviews_ids

def get_more(session, reviews_ids):
    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
    payload = {
        'reviews': ','.join(reviews_ids),  # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR',  # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX',  # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }
    soup = post_soup(session, url, payload)
    return soup

def parse_reviews(session, url):
    '''Get all reviews from one page'''
    print('[parse_reviews] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    hotel_name = soup.find('h1', class_='heading').text
    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return
    soup = get_more(session, reviews_ids)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    items = []
    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'
        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''
        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]
        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'],  # 'ratingDate' instead of 'relativeDate'
        }
        items.append(item)
        print('\n--- review ---\n')
        for key, val in item.items():
            print(' ', key, ':', val)
        print()
    return items

def write_in_csv(items, filename='results.csv',
                 headers=['hotel name', 'review title', 'review body',
                          'review date', 'contributions', 'helpful vote',
                          'user name', 'user location', 'rating'],
                 mode='w'):
    print('--- CSV ---')
    with io.open(filename, mode, encoding="utf-8") as csvfile:
        csv_file = csv.DictWriter(csvfile, headers)
        if mode == 'w':
            csv_file.writeheader()
        csv_file.writerows(items)

DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'

start_urls = [
    'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]

headers = [
    DB_COLUMN,
    DB_COLUMN1,
]

for url in start_urls:
    # get all reviews for 'url' and 'lang'
    items = scrape(url)
    if not items:
        print('No reviews')
    else:
        # write in CSV
        filename = url.split('Reviews-')[1][:-5]
        print('filename:', filename)
        write_in_csv(items, filename + '.csv', headers, mode='w')
I am trying to scrape some Tripadvisor reviews as a complete newbie to this.
I'm using code from Susanli2016.
It worked (after removing the "language" attribute) for one link, but it doesn't work for any other link (for example).
I'm receiving the error:
Traceback (most recent call last):
  File "<pyshell#37>", line 4, in <module>
    items = scrape(url)
  File "<pyshell#13>", line 11, in scrape
    items = parse(session, url + '?filterLang=' + lang)
  File "<pyshell#18>", line 15, in parse
    num_reviews = int(num_reviews) # convert text into integer
ValueError: invalid literal for int() with base 10: '5.695'
(where 5,695 is the number of reviews in the page)
I'm attaching the code here in case someone can help me.
Thank you so much!
Silvia
--
Hereby the complete code:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io

def display(content, filename='output.html'):
    with open(filename, 'wb') as f:
        f.write(content)
    webbrowser.open(filename)

def get_soup(session, url, show=False):
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[get_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''
    r = session.post(url, data=params)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[post_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def scrape(url, lang='ALL'):
    # create session to keep all cookies (etc.) between requests
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })
    items = parse(session, url + '?filterLang=' + lang)
    return items

def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''
    print('[parse] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse] no soup:', url)
        return
    num_reviews = soup.find('span', class_='reviews_header_count').text  # get text
    num_reviews = num_reviews[1:-1]
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews)  # convert text into integer
    print('[parse] num_reviews ALL:', num_reviews)
    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)
    items = []
    offset = 0
    while(True):
        subpage_url = url_template.format(offset)
        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break
        items += subpage_items
        if len(subpage_items) < 5:
            break
        offset += 5
    return items

def get_reviews_ids(soup):
    items = soup.find_all('div', attrs={'data-reviewid': True})
    if items:
        reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
        print('[get_reviews_ids] data-reviewid:', reviews_ids)
        return reviews_ids

def get_more(session, reviews_ids):
    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
    payload = {
        'reviews': ','.join(reviews_ids),  # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR',  # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX',  # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }
    soup = post_soup(session, url, payload)
    return soup

def parse_reviews(session, url):
    '''Get all reviews from one page'''
    print('[parse_reviews] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    hotel_name = soup.find('h1', id='HEADING').text
    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return
    soup = get_more(session, reviews_ids)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    items = []
    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'
        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''
        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]
        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'],  # 'ratingDate' instead of 'relativeDate'
        }
        items.append(item)
        print('\n--- review ---\n')
        for key, val in item.items():
            print(' ', key, ':', val)
        print()
    return items

def write_in_csv(items, filename='results.csv',
                 headers=['hotel name', 'review title', 'review body',
                          'review date', 'contributions', 'helpful vote',
                          'user name', 'user location', 'rating'],
                 mode='w'):
    print('--- CSV ---')
    with io.open(filename, mode, encoding="utf-8") as csvfile:
        csv_file = csv.DictWriter(csvfile, headers)
        if mode == 'w':
            csv_file.writeheader()
        csv_file.writerows(items)

DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'

start_urls = [
    'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]

headers = [
    DB_COLUMN,
    DB_COLUMN1,
]

lang = 'it'

for url in start_urls:
    # get all reviews for 'url' and 'lang'
    items = scrape(url)
    if not items:
        print('No reviews')
    else:
        # write in CSV
        filename = url.split('Reviews-')[1][:-5]
        print('filename:', filename)
        write_in_csv(items, filename + '.csv', headers, mode='w')
Thanks to all the commenters. I realized the issue lay in the different Italian and US conventions for writing thousands separators (we use ".", whereas Americans use ",").
You seem to have the string 5.695 for the number of reviews before trying to cast it to int with num_reviews = int(num_reviews).
The . in 5.695 is probably a thousands separator.
So remove the . like this before using int():
num_reviews = num_reviews.replace('.', '')
num_reviews = int(num_reviews)
The error is due to the full stop in the number you are trying to convert. To make sure it works with all typing formats, you need to filter for numerical characters only before converting to int:
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
num_reviews = num_reviews[1:-1]
num_reviews = num_reviews.replace(',', '').replace('.','')
num_reviews = int(num_reviews)
Or, more generically, keep only the numerical characters in the string num_reviews.
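For instance, a self-contained sketch of that generic filtering (using a made-up string):

raw = '"5.695"'   # example of the scraped text, with quotes and a thousands separator
num_reviews = int(''.join(ch for ch in raw if ch.isdigit()))
print(num_reviews)   # 5695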
The string cannot be parsed directly to an integer value. In this case you can first convert it into a float, then convert that to an int if you want.
num_reviews = int(float(num_reviews))
I'm trying to scrape a website for job postings data, and the output looks like this:
[{'job_title': 'Junior Data Scientist', 'company': '\n\n BBC',
  'summary': "\n We're now seeking a Junior Data Scientist to come and work with our Marketing & Audiences team in London. The Data Science team are responsible for designing...",
  'link': 'www.jobsite.com',
  'summary_text': "Job Introduction\nImagine if Netflix, The Huffington Post, ESPN, and Spotify were all rolled into one....etc
I want to create a dataframe, or a CSV, that looks like this:
right now, this is the loop I'm using:
for page in pages:
    source = requests.get('https://www.jobsite.co.uk/jobs?q=data+scientist&start='.format()).text
    soup = BeautifulSoup(source, 'lxml')

    results = []
    for jobs in soup.findAll(class_='result'):
        result = {
            'job_title': '',
            'company': '',
            'summary': '',
            'link': '',
            'summary_text': ''
        }
and after using the loop, I just print the results.
What would be a good way to get the output in a dataframe? Thanks!
Look at the pandas DataFrame API. There are several ways you can initialize a dataframe:
list of dictionaries
list of lists
You just need to append either a list or a dictionary to a global variable, and you should be good to go.
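For reference, both constructions look like this (tiny made-up rows, not the scraped data):

import pandas as pd

# from a list of dictionaries: column names come from the keys
df1 = pd.DataFrame([{'job_title': 'Junior Data Scientist', 'company': 'BBC'}])

# from a list of lists: column names are supplied explicitly
df2 = pd.DataFrame([['Junior Data Scientist', 'BBC']],
                   columns=['job_title', 'company'])

The loop below builds the list-of-dictionaries variant.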
import pandas

results = []
for page in pages:
    source = requests.get('https://www.jobsite.co.uk/jobs?q=data+scientist&start='.format()).text
    soup = BeautifulSoup(source, 'lxml')

    for jobs in soup.findAll(class_='result'):
        result = {
            'job_title': '',  # assuming this has a value like the one you shared in the example in your question
            'company': '',
            'summary': '',
            'link': '',
            'summary_text': ''
        }
        results.append(result)

# results is now a list of dictionaries
df = pandas.DataFrame(results)
One other suggestion, don't think about dumping this in a dataframe within the same program. Dump all your HTML files first into a folder, and then parse them again. This way if you need more information from the page which you hadn't considered before, or if a program terminates due to some parsing error or timeout, the work is not lost. Keep parsing separate from crawling logic.
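A rough sketch of that separation, with hypothetical file and variable names (page_urls, SAVE_DIR) standing in for your own:

import os
import requests
from bs4 import BeautifulSoup

SAVE_DIR = 'html_pages'                                                  # hypothetical folder for the raw pages
page_urls = ['https://www.jobsite.co.uk/jobs?q=data+scientist&page=1']   # your crawl list
os.makedirs(SAVE_DIR, exist_ok=True)

# crawling step: just fetch and save
for i, url in enumerate(page_urls):
    html = requests.get(url).text
    with open(os.path.join(SAVE_DIR, 'page_{}.html'.format(i)), 'w', encoding='utf-8') as f:
        f.write(html)

# parsing step: can be re-run as often as needed without hitting the site again
for name in os.listdir(SAVE_DIR):
    with open(os.path.join(SAVE_DIR, name), encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
    # ... extract job_title, company, etc. from soup here ...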
I think you need to define the number of pages and add that into your URL (ensure you have a placeholder for the value, which I don't think your code, or the other answer, has). I have done this by extending your URL to include a page parameter in the query string, with a placeholder for its value.
Is your selector of class result correct? You could certainly also use for job in soup.select('.job'):. You then need to define appropriate selectors to populate the values. I think it is easier to grab all the job links for each page, then visit each page and extract the values from a JSON-like string in the page. Add a Session to re-use the connection.
Explicit waits are required to prevent being blocked.
import requests
from bs4 import BeautifulSoup as bs
import json
import pandas as pd
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
results = []
links = []
pages = 3

with requests.Session() as s:
    for page in range(1, pages + 1):
        try:
            url = 'https://www.jobsite.co.uk/jobs?q=data+scientist&start=1&page={}'.format(page)
            source = s.get(url, headers=headers).text
            soup = bs(source, 'lxml')
            links.append([link['href'] for link in soup.select('.job-title a')])
        except Exception as e:
            print(e, url)
        finally:
            time.sleep(2)
    final_list = [item for sublist in links for item in sublist]
    for link in final_list:
        source = s.get(link, headers=headers).text
        soup = bs(source, 'lxml')
        data = soup.select_one('#jobPostingSchema').text  # json like string containing all info
        item = json.loads(data)
        result = {
            'Title': item['title'],
            'Company': item['hiringOrganization']['name'],
            'Url': link,
            'Summary': bs(item['description'], 'lxml').text
        }
        results.append(result)
        time.sleep(1)

df = pd.DataFrame(results, columns=['Title', 'Company', 'Url', 'Summary'])
print(df)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8-sig', index=False)
Sample of results:
I can't imagine you want all pages but you could use something similar to:
import requests
from bs4 import BeautifulSoup as bs
import json
import pandas as pd
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
results = []
links = []
pages = 0

def get_links(url, page):
    try:
        source = s.get(url, headers=headers).text
        soup = bs(source, 'lxml')
        page_links = [link['href'] for link in soup.select('.job-title a')]
        if page == 1:
            global pages
            pages = int(soup.select_one('.page-title span').text.replace(',', ''))
    except Exception as e:
        print(e, url)
    finally:
        time.sleep(1)
    return page_links

with requests.Session() as s:
    links.append(get_links('https://www.jobsite.co.uk/jobs?q=data+scientist&start=1&page=1', 1))
    for page in range(2, pages + 1):
        url = 'https://www.jobsite.co.uk/jobs?q=data+scientist&start=1&page={}'.format(page)
        links.append(get_links(url, page))
    final_list = [item for sublist in links for item in sublist]
    for link in final_list:
        source = s.get(link, headers=headers).text
        soup = bs(source, 'lxml')
        data = soup.select_one('#jobPostingSchema').text  # json like string containing all info
        item = json.loads(data)
        result = {
            'Title': item['title'],
            'Company': item['hiringOrganization']['name'],
            'Url': link,
            'Summary': bs(item['description'], 'lxml').text
        }
        results.append(result)
        time.sleep(1)

df = pd.DataFrame(results, columns=['Title', 'Company', 'Url', 'Summary'])
print(df)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8-sig', index=False)