How to build a DataFrame from two dicts in Python - python

I am trying to build a dataframe; in this attempt the data and the column names each come from a dict. (I tried doing this with pd.Series but I kept running into issues there as well.)
import requests
import pandas as pd
from bs4 import BeautifulSoup
# get link and parse
page = requests.get('https://www.finviz.com/screener.ashx?v=111&ft=4')
soup = BeautifulSoup(page.text, 'html.parser')
# return 'Title's for each filter
# to be used as columns in dataframe
titles = soup.find_all('span', attrs={'class': 'screener-combo-title'})
title_list = []
for t in titles:
    t = t.stripped_strings
    t = ' '.join(t)
    title_list.append(t)
title_list = {k: v for k, v in enumerate(title_list)}
# finding filters-cells tag id's
# to be used to build url
filters = soup.find_all('select', attrs={'data-filter': True})
filter_list = []
for f in filters:
    filter_list.append(f.get('data-filter'))
# finding selectable values per cell
# to be used as data in dataframe
final_list = []
for f in filters:
    options = f.find_all('option', attrs={'value': True})
    option_list = []  # list needs to stay inside
    for option in options:
        if option['value'] != "":
            option_list.append(option['value'])
    final_list.append(option_list)
final_list = {k: v for k, v in enumerate(final_list)}
df = pd.DataFrame([final_list], columns=[title_list])
print(df)
This results in TypeError: unhashable type: 'dict'. An example of the desired output would look like this (the first column is NOT the index):
Exchange    Index     ...
amex        s&p500    ...
nasd        djia
nyse
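The TypeError comes from columns=[title_list]: that wraps a dict in a list, and pandas then tries to hash the dict itself. One minimal way to line the two dicts up instead (a sketch, assuming the enumerate keys of title_list and final_list correspond one-to-one) is to pad the option lists and pass the titles as plain column labels:
import itertools
import pandas as pd
# title_list and final_list are the enumerate-keyed dicts built above;
# zip_longest pads the shorter option lists so every column has equal length
columns = list(title_list.values())
rows = list(itertools.zip_longest(*final_list.values(), fillvalue=''))
df = pd.DataFrame(rows, columns=columns)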

Here is an attempt to build a dict where each key is a filter title and each value is the list of possible choices for that filter. Does it suit your needs?
import requests
import pandas as pd
from bs4 import BeautifulSoup
# get link and parse
page = requests.get('https://www.finviz.com/screener.ashx?v=111&ft=4')
soup = BeautifulSoup(page.text, 'html.parser')
all_dict = {}
filters = soup.find_all('td', attrs={'class': 'filters-cells'})
for i in range(len(filters) // 2):
    i_title = 2 * i
    i_value = 2 * i + 1
    sct = filters[i_title].find_all('span', attrs={'class': 'screener-combo-title'})
    if len(sct) == 1:
        title = ' '.join(sct[0].stripped_strings)
        values = [v.text for v in filters[i_value].find_all('option', attrs={'value': True}) if v.text]
        all_dict[title] = values
max_element = max([len(v) for v in all_dict.values()])
for k in all_dict:
    all_dict[k] = all_dict[k] + [''] * (max_element - len(all_dict[k]))
df = pd.DataFrame.from_dict(all_dict)

Related

ValueError: setting an array element with a sequence. For pandas.concat

I have tried many ways to concatenate a list of DataFrames together but am continuously getting the error message "ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part."
At the moment the list only contains two elements, both of them being DataFrames. They do have different columns in places, but I didn't think this would be an issue. At the moment I have:
df_year_stats = pd.concat(yearStats, axis = 0, ignore_index = True).reset_index(drop=True)
I don't think the dataframes have any lists in them, but that is the only plausible cause I have thought of so far; if so, how would I go about checking for these?
Any help would be greatly appreciated, thank you.
edit code:
import pandas as pd
from pandas.api.types import is_string_dtype
import requests
from bs4 import BeautifulSoup as bs
course_df = pd.read_csv("dg_course_table.csv")
soup = bs(requests.get('https://www.pgatour.com/stats/categories.ROTT_INQ.html').text, 'html.parser')
tabs = soup.find('div',attrs={'class','tabbable-head clearfix hidden-small'})
subStats = tabs.find_all('a')
# creating lists of tab and link, and removing the first and last
tab_links = []
tab_names = []
for subStat in subStats:
    tab_names.append(subStat.text)
    tab_links.append(subStat.get('href'))
tab_names = tab_names[1:-2] #potentially remove other areas here- points/rankings and streaks
tab_links = tab_links[1:-2]
# creating empty lists
stat_links = []
all_stat_names = []
# looping through each tab and extracting all of the stats URL's, along with the corresponding stat name.
for link in tab_links:
    page2 = 'https://www.pgatour.com' + str(link)
    req2 = requests.get(page2)
    soup2 = bs(req2.text, 'html.parser')
    # find correct part of html code
    stat = soup2.find('section',attrs={'class','module-statistics-off-the-tee clearfix'})
    specificStats = stat.find_all('a')
    for stat in specificStats:
        stat_links.append(stat.get('href'))
        all_stat_names.append(stat.text)
s_asl = pd.Series(stat_links, index = all_stat_names )
s_asl = s_asl.drop(labels='show more')
s_asl = s_asl.str[:-4]
tourn_links = pd.Series([],dtype=('str'))
df_all_stats = []
req4 = requests.get('https://www.pgatour.com/content/pgatour/stats/stat.120.y2014.html')
soup4 = bs(req4.text, 'html.parser')
stat = soup4.find('select',attrs={'aria-label':'Available Tournaments'})
htm = stat.find_all('option')
for h in htm: #finding all tournament codes for the given year
    z = pd.Series([h.get('value')],index=[h.text])
    tourn_links = tourn_links.append(z)
yearStats = []
count = 0
for tournament in tourn_links[0:2]: # create stat tables for two different golf tournaments
    print(tournament)
    df1 = []
    df_labels = []
    for r in range(0,len(s_asl)): #loop through all stat links adding the corresponding stat to that tournament's df
        try:
            link = 'https://www.pgatour.com'+s_asl[r]+'y2014.eon.'+tournament+'.html'
            web = pd.read_html(requests.get(link).text)
            table = web[1].set_index('PLAYER NAME')
            df1.append(table)
            df_labels.append(s_asl.index[r])
        except:
            print("empty table")
    try:
        df_tourn_stats = pd.concat(df1,keys=df_labels,axis=1)
        df_tourn_stats.reset_index(level=0, inplace=True)
        df_tourn_stats.insert(1,'Tournament Name',tourn_links.index[count])
        df_tourn_stats.to_csv(str(count) + ".csv")
        df_tourn_stats = df_tourn_stats.loc[:,~df_tourn_stats.columns.duplicated()].copy()
        yearStats.append(df_tourn_stats)
    except:
        print("NO DATA")
    count = count + 1
#combine the stats of the two different tournaments into one dataframe
df_year_stats = pd.concat(yearStats, axis = 0, ignore_index = True).reset_index(drop=True)
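As for actually checking whether any cell holds a list (or other sequence) before the concat, a quick scan over the frames in yearStats could look like this sketch:
# flag any frame whose cells contain list/tuple values, which pd.concat cannot align
for i, frame in enumerate(yearStats):
    has_seq = frame.applymap(lambda x: isinstance(x, (list, tuple))).any().any()
    print(f"frame {i} contains list-valued cells: {has_seq}")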

Python BeautifulSoup4 Parsing: Hidden html elements on Yahoo Finance

I am analyzing the balance sheet of Amazon on Yahoo Finance. It contains nested rows, and I cannot extract all of them. The sheet looks like this:
I used BeautifulSoup4 and the Selenium web driver to get me the following output:
The following is the code:
import pandas as pd
from bs4 import BeautifulSoup
import re
from selenium import webdriver
import string
import time
# chart display specifications w/ Panda
pd.options.display.float_format = '{:.0f}'.format
pd.set_option('display.width', None)
is_link = 'https://finance.yahoo.com/quote/AMZN/balance-sheet/'
chrome_path = r"C:\\Users\\hecto\\Documents\\python\\drivers\\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get(is_link)
html = driver.execute_script('return document.body.innerHTML;')
soup = BeautifulSoup(html,'lxml')
features = soup.find_all('div', class_='D(tbr)')
headers = []
temp_list = []
label_list = []
final = []
index = 0
#create headers
for item in features[0].find_all('div', class_='D(ib)'):
    headers.append(item.text)
#statement contents
while index <= len(features)-1:
    #filter for each line of the statement
    temp = features[index].find_all('div', class_='D(tbc)')
    for line in temp:
        #each item adding to a temporary list
        temp_list.append(line.text)
    #temp_list added to final list
    final.append(temp_list)
    #clear temp_list
    temp_list = []
    index+=1
df = pd.DataFrame(final[1:])
df.columns = headers
#function to make all values numerical
def convert_to_numeric(column):
    first_col = [i.replace(',','') for i in column]
    second_col = [i.replace('-','') for i in first_col]
    final_col = pd.to_numeric(second_col)
    return final_col
for column in headers[1:]:
    df[column] = convert_to_numeric(df[column])
final_df = df.fillna('-')
print(df)
Again, I cannot seem to get all the rows of the balance sheet in my output (e.g. Cash, Total Current Assets). Where did I go wrong? Am I missing something?
You may have to click the "Expand All" button to see the additional rows. Refer to this thread to see how to simulate the click in Selenium: python selenium click on button
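For example, a minimal sketch of expanding the rows before parsing (the XPath for the button is an assumption and may need adjusting to the current page markup):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# driver setup simplified; reuse the chrome_path setup from the question if needed
driver = webdriver.Chrome()
driver.get('https://finance.yahoo.com/quote/AMZN/balance-sheet/')

# wait until the "Expand All" control is clickable, then click it
expand_all = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//button[.//span[text()='Expand All']]"))
)
expand_all.click()

# re-read the DOM only after the nested rows have been rendered
html = driver.execute_script('return document.body.innerHTML;')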

python pandas remove character

I am working on a project, and I need to remove the left-most and right-most character of a data result. The data comes from a scrape of craigslist, and the neighborhood results come back as '(####)', but what I need is ####. I am using pandas and trying to use lstrip & rstrip. When I attempt it inside the python shell it works, but when I use it on my data it does not.
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.rstrip(')')
For some reason the rstrip does work and removes the ')', but the lstrip does not.
The full code is:
from bs4 import BeautifulSoup
import json
from requests import get
import numpy as np
import pandas as pd
import csv
print('hello world')
#get the initial page for the listings, to get the total count
response = get('https://washingtondc.craigslist.org/search/hhh?query=rent&availabilityMode=0&sale_date=all+dates')
html_result = BeautifulSoup(response.text, 'html.parser')
results = html_result.find('div', class_='search-legend')
total = int(results.find('span',class_='totalcount').text)
pages = np.arange(0,total+1,120)
neighborhood = []
bedroom_count =[]
sqft = []
price = []
link = []
for page in pages:
    #print(page)
    response = get('https://washingtondc.craigslist.org/search/hhh?s='+str(page)+'query=rent&availabilityMode=0&sale_date=all+dates')
    html_result = BeautifulSoup(response.text, 'html.parser')
    posts = html_result.find_all('li', class_='result-row')
    for post in posts:
        if post.find('span',class_='result-hood') is not None:
            post_url = post.find('a',class_='result-title hdrlnk')
            post_link = post_url['href']
            link.append(post_link)
            post_neighborhood = post.find('span',class_='result-hood').text
            post_price = int(post.find('span',class_='result-price').text.strip().replace('$',''))
            neighborhood.append(post_neighborhood)
            price.append(post_price)
            if post.find('span',class_='housing') is not None:
                if 'ft2' in post.find('span',class_='housing').text.split()[0]:
                    post_bedroom = np.nan
                    post_footage = post.find('span',class_='housing').text.split()[0][:-3]
                    bedroom_count.append(post_bedroom)
                    sqft.append(post_footage)
                elif len(post.find('span',class_='housing').text.split())>2:
                    post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
                    post_footage = post.find('span',class_='housing').text.split()[2][:-3]
                    bedroom_count.append(post_bedroom)
                    sqft.append(post_footage)
                elif len(post.find('span',class_='housing').text.split())==2:
                    post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
                    post_footage = np.nan
                    bedroom_count.append(post_bedroom)
                    sqft.append(post_footage)
            else:
                post_bedroom = np.nan
                post_footage = np.nan
                bedroom_count.append(post_bedroom)
                sqft.append(post_footage)
#create results data frame
post_results = pd.DataFrame({'neighborhood':neighborhood,'footage':sqft,'bedroom':bedroom_count,'price':price,'link':link})
#clean up results
post_results.drop_duplicates(subset='link')
post_results['footage'] = post_results['footage'].replace(0,np.nan)
post_results['bedroom'] = post_results['bedroom'].replace(0,np.nan)
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.rstrip(')')
post_results = post_results.dropna(subset=['footage','bedroom'],how='all')
post_results.to_csv("rent_clean.csv",index=False)
print(len(post_results.index))
This problem will happen when you have whitespace at the front. For example:
s = pd.Series([' (xxxx)','(yyyy) '])
s.str.strip('(|)')
0     (xxxx
1     yyyy)
dtype: object
What we can do is strip twice:
s.str.strip().str.strip('(|)')
0    xxxx
1    yyyy
dtype: object
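Applied to the column from the question, that would be (a short sketch reusing the question's own dataframe and column name):
post_results['neighborhood'] = post_results['neighborhood'].str.strip().str.strip('()')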
From my understanding of your question, you are removing characters from a string. You don't need pandas for this; you can remove the first and last character of a string by slicing, like this:
new_word = old_word[1:-1]
This should work for you. Good luck.

How do I create a dataframe of jobs and companies that includes hyperlinks?

I am making a function to print a list of links so I can add them to a list of companies and job titles. However, I am having difficulties navigating tag sub-contents. I am looking to list all the 'href' attributes of the 'a' tags inside each 'div', like so:
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
page = "https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html"
headers = {'User-Agent':'Mozilla/5.0'}
def get_soup():
    session = requests.Session()
    pageTree = session.get(page, headers=headers)
    return BeautifulSoup(pageTree.content, 'html.parser')
pageSoup = get_soup()
def print_links():
"""this function scrapes the job title links"""
jobLink = [div.a for div in pageSoup.find_all('div', class_='title')]
for div in jobLink:
print(div['href'])
I am trying to make a list, but my result is simply text and does not seem to be a link, like so:
/pagead/clk?mo=r&ad=-6NYlbfkN0DhVAxkc_TxySVbUOs6bxWYWOfhmDTNcVTjFFBAY1FXZ2RjSBnfHw4gS8ZdlOOq-xx2DHOyKEivyG9C4fWOSDdPgVbQFdESBaF5zEV59bYpeWJ9R8nSuJEszmv8ERYVwxWiRnVrVe6sJXmDYTevCgexdm0WsnEsGomjLSDeJsGsHFLAkovPur-rE7pCorqQMUeSz8p08N_WY8kARDzUa4tPOVSr0rQf5czrxiJ9OU0pwQBfCHLDDGoyUdvhtXy8RlOH7lu3WEU71VtjxbT1vPHPbOZ1DdjkMhhhxq_DptjQdUk_QKcge3Ao7S3VVmPrvkpK0uFlA0tm3f4AuVawEAp4cOUH6jfWSBiGH7G66-bi8UHYIQm1UIiCU48Yd_pe24hfwv5Hc4Gj9QRAAr8ZBytYGa5U8z-2hrv2GaHe8I0wWBaFn_m_J10ikxFbh6splYGOOTfKnoLyt2LcUis-kRGecfvtGd1b8hWz7-xYrYkbvs5fdUJP_hDAFGIdnZHVJUitlhjgKyYDIDMJ-QL4aPUA-QPu-KTB3EKdHqCgQUWvQud4JC2Fd8VXDKig6mQcmHhZEed-6qjx5PYoSifi5wtRDyoSpkkBx39UO3F918tybwIbYQ2TSmgCHzGm32J4Ny7zPt8MPxowRw==&p=0&fvj=1&vjs=3
Additionally, here is my attempt at making a list with the links:
def get_job_titles():
"""this function scrapes the job titles"""
jobs = []
jobTitle = pageSoup.find_all('div', class_='title')
for span in jobTitle:
link = span.find('href')
if link:
jobs.append({'title':link.text,
'href':link.attrs['href']})
else:
jobs.append({'title':span.text, 'href':None})
return jobs
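As a quick aside before the fuller approach below: the hrefs printed above are relative, so joining them with the site root should already give clickable links. A small sketch rebuilding the question's jobLink list (assuming pageSoup from the question):
from urllib.parse import urljoin

base = 'https://www.indeed.com'
# rebuild the same list as in print_links() and make each href absolute
jobLink = [div.a for div in pageSoup.find_all('div', class_='title')]
job_urls = [urljoin(base, a['href']) for a in jobLink if a is not None]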
I would regex the required info out of the returned html and construct each url from the parameters the page's javascript uses to dynamically build the urls. Interestingly, the total number of listings is different when using requests than when using a browser. You can manually enter the number of listings, e.g. 6175 (currently), or use the number returned by the request (which is lower, so you miss some results); you could also use selenium to get the correct initial result count. You can then issue requests with offsets to get all listings.
Listings can be randomized in terms of ordering.
It seems you can introduce a limit parameter to increase results_per_page up to 50 e.g.
https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&limit=50&start=0
Furthermore, it seems that it is possible to retrieve more results than are actually given as the total results count on the webpage.
Python, with 10 listings per page:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
p = re.compile(r"jobmap\[\d+\]= ({.*?})")
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}
with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    number_of_listings = int(soup.select_one('[name=description]')['content'].split(' ')[0].replace(',',''))
    #number_of_pages = math.ceil(number_of_listings/listings_per_page)
    number_of_pages = math.ceil(6175/listings_per_page) #manually calculated
    for page in range(1, number_of_pages + 1):
        if page > 1:
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(10*page-1))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]
        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {'title' : data['title']
                   ,'company' : data['cmp']
                   ,'url' : f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'
                   }
            final[counter] = row
            counter+=1
df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )
If you want to use selenium to get the correct initial listings count:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--headless")
d = webdriver.Chrome(r'C:\Users\HarrisQ\Documents\chromedriver.exe', options = options)
d.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
number_of_listings = int(d.find_element_by_css_selector('[name=description]').get_attribute('content').split(' ')[0].replace(',',''))
d.quit()
p = re.compile(r"jobmap\[\d+\]= ({.*?})")
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}
with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    number_of_pages = math.ceil(6175/listings_per_page) #manually calculated
    for page in range(1, number_of_pages + 1):
        if page > 1:
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(10*page-1))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]
        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {'title' : data['title']
                   ,'company' : data['cmp']
                   ,'url' : f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'
                   }
            final[counter] = row
            counter+=1
df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )

Python: Using XPath to get data from a table

I'm trying to get data from the table at the bottom of http://projects.fivethirtyeight.com/election-2016/delegate-targets/.
import requests
from lxml import html
url = "http://projects.fivethirtyeight.com/election-2016/delegate-targets/"
response = requests.get(url)
doc = html.fromstring(response.text)
tables = doc.findall('.//table[@class="delegates desktop"]')
election = tables[0]
election_rows = election.findall('.//tr')
def extractCells(row, isHeader=False):
    if isHeader:
        cells = row.findall('.//th')
    else:
        cells = row.findall('.//td')
    return [val.text_content() for val in cells]
import pandas
def parse_options_data(table):
    rows = table.findall(".//tr")
    header = extractCells(rows[1], isHeader=True)
    data = [extractCells(row, isHeader=False) for row in rows[2:]]
    return pandas.DataFrame(data, columns=header)
election_data = parse_options_data(election)
election_data
I'm having trouble with the topmost row with the candidates' names ('Trump', 'Cruz', 'Kasich'). It is under tr class="top" and right now I only have tr class="bottom" (starting with the row that says "won/target").
Any help is much appreciated!
The candidate names are in the 0-th row:
candidates = [val.text_content() for val in rows[0].findall('.//th')[1:]]
Or, if reusing the same extractCells() function:
candidates = extractCells(rows[0], isHeader=True)[1:]
The [1:] slice here is to skip the first empty th cell.
Not ideal (hard-coded), but it runs the way you want:
def parse_options_data(table):
    rows = table.findall(".//tr")
    candidate = extractCells(rows[0], isHeader=True)[1:]
    header = extractCells(rows[1], isHeader=True)[:3] + candidate
    data = [extractCells(row, isHeader=False) for row in rows[2:]]
    return pandas.DataFrame(data, columns=header)
