Can't get data in table form using Selenium Python - python

I am new to scraping with Selenium in Python. I can retrieve some of the data, but I want it in table form, as it is displayed on the web page:
Here is what i have so far:
# Open the ArcGIS item page and print the text of every table row.
url='https://definitivehc.maps.arcgis.com/home/item.html?id=1044bb19da8d4dbfb6a96eb1b4ebf629&view=list&showFilters=false#data'
browser = webdriver.Chrome(r"C:\task\chromedriver")
browser.get(url)
# The table is filled in by JavaScript after the page loads, so wait for it.
time.sleep(25)
# XPath attribute tests are written with '@' ('#class' is not valid XPath).
# Select every row that contains at least one header or data cell.
rows_in_table = browser.find_elements_by_xpath('//table[@class="dgrid-row-table"]//tr[th or td]')
for element in rows_in_table:
    # Cells are separated by newlines in element.text; removing them joins
    # all cells of the row into one string.
    print(element.text.replace('\n', ''))
result snippet:
Hospital NameHospital TypeCityState AbrvZip CodeCounty NameState Name
Phoenix VA Health Care System (AKA Carl T Hayden VA Medical Center)VA HospitalPhoenixAZ85012MaricopaArizona040130401362620000.001
Southern Arizona VA Health Care SystemVA HospitalTucsonAZ85723PimaArizona04019040192952952202.002
VA Central California Health Care SystemVA HospitalFresnoCA93703FresnoCalifornia060190601954542202.003
VA Connecticut Healthcare System - West Haven Campus (AKA West Haven VA Medical Center)VA HospitalWest HavenCT6516New HavenConnecticut09009090092162161102.004
I would really appreciate help from an expert on this. Thanks.

This is an updated version of what @Andrej answered. This code downloads the table and, instead of printing it, saves it as an Excel document.
# Download every record of the hospital-beds feature layer, page by page,
# and save the collected attributes to an Excel workbook.
import json
import requests
import pandas as pd

config_url = 'https://definitivehc.maps.arcgis.com/sharing/rest/portals/self?culture=en-us&f=json'
page_url = 'https://services7.arcgis.com/{_id}/arcgis/rest/services/Definitive_Healthcare_USA_Hospital_Beds/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=OBJECTID%20ASC&resultOffset={offset}&resultRecordCount=50&cacheHint=true&quantizationParameters=%7B%22mode%22%3A%22edit%22%7D'
# Number of records requested per page; must match resultRecordCount in page_url.
page_size = 50

# The portal id is part of the feature-service URL.
_id = requests.get(config_url).json()['id']

required = []
offset = 0
while True:
    data = requests.get(page_url.format(_id=_id, offset=offset)).json()
    # uncomment this to print all data:
    # print(json.dumps(data, indent=4))
    features = data['features']
    required.extend(f['attributes'] for f in features)
    # A short (or empty) page means there is nothing left to fetch. Checking
    # the page length also terminates correctly when the total number of
    # records is an exact multiple of the page size.
    if len(features) < page_size:
        break
    offset += page_size

# One row per record, one column per attribute.
df = pd.json_normalize(required)
# Default write mode creates (or overwrites) the workbook.
with pd.ExcelWriter('dataFunction.xlsx') as writer:
    df.to_excel(writer)
I tried this and uploaded the excel sheet HERE(LINK TO EXCEL SHEET)!

The data is loaded dynamically using Javascript. You can use requests module to simulate those requests:
# Page through the hospital-beds feature layer with plain HTTP requests and
# print every record together with its running number.
import json
import requests

config_url = 'https://definitivehc.maps.arcgis.com/sharing/rest/portals/self?culture=en-us&f=json'
page_url = 'https://services7.arcgis.com/{_id}/arcgis/rest/services/Definitive_Healthcare_USA_Hospital_Beds/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=OBJECTID%20ASC&resultOffset={offset}&resultRecordCount=50&cacheHint=true&quantizationParameters=%7B%22mode%22%3A%22edit%22%7D'
# Number of records requested per page; must match resultRecordCount in page_url.
page_size = 50

# The portal id is part of the feature-service URL.
_id = requests.get(config_url).json()['id']

offset = 0
while True:
    data = requests.get(page_url.format(_id=_id, offset=offset)).json()
    # uncomment this to print all data:
    # print(json.dumps(data, indent=4))
    features = data['features']
    # Number the records continuously across pages, starting at 1.
    for i, f in enumerate(features, offset + 1):
        print(i, f['attributes'])
        print('-' * 160)
    # A short (or empty) page is the last one. Checking the page length also
    # stops correctly when the record count is an exact multiple of the page
    # size, and does not depend on the loop variable being set.
    if len(features) < page_size:
        break
    offset += page_size
Prints all 6624 records:
...
6614 {'OBJECTID': 6614, 'HOSPITAL_NAME': 'Walter E Washington Convention Center Field Hospital (Temporarily Open due to COVID-19)', 'HOSPITAL_TYPE': 'Short Term Acute Care Hospital', 'HQ_ADDRESS': '801 Mount Vernon Pl Nw', 'HQ_ADDRESS1': None, 'HQ_CITY': 'Washington', 'HQ_STATE': 'DC', 'HQ_ZIP_CODE': '20001', 'COUNTY_NAME': 'District of Columbia', 'STATE_NAME': 'District of Columbia', 'STATE_FIPS': '11', 'CNTY_FIPS': '001', 'FIPS': '11001', 'NUM_LICENSED_BEDS': None, 'NUM_STAFFED_BEDS': None, 'NUM_ICU_BEDS': 0, 'ADULT_ICU_BEDS': 0, 'PEDI_ICU_BEDS': None, 'BED_UTILIZATION': None, 'Potential_Increase_In_Bed_Capac': 0, 'AVG_VENTILATOR_USAGE': None}
----------------------------------------------------------------------------------------------------------------------------------------------------------------
6615 {'OBJECTID': 6615, 'HOSPITAL_NAME': 'Joint Base Cape Cod Field Hospital (Temporarily Open due to COVID-19)', 'HOSPITAL_TYPE': 'Short Term Acute Care Hospital', 'HQ_ADDRESS': 'Connery Ave', 'HQ_ADDRESS1': None, 'HQ_CITY': 'Buzzards Bay', 'HQ_STATE': 'MA', 'HQ_ZIP_CODE': '2542', 'COUNTY_NAME': 'Barnstable', 'STATE_NAME': 'Massachusetts', 'STATE_FIPS': '25', 'CNTY_FIPS': '001', 'FIPS': '25001', 'NUM_LICENSED_BEDS': None, 'NUM_STAFFED_BEDS': None, 'NUM_ICU_BEDS': 0, 'ADULT_ICU_BEDS': 0, 'PEDI_ICU_BEDS': None, 'BED_UTILIZATION': None, 'Potential_Increase_In_Bed_Capac': 0, 'AVG_VENTILATOR_USAGE': None}
----------------------------------------------------------------------------------------------------------------------------------------------------------------
6616 {'OBJECTID': 6616, 'HOSPITAL_NAME': 'UMass Lowell Recreation Center Field Hospital (Temporarily Open due to COVID-19)', 'HOSPITAL_TYPE': 'Short Term Acute Care Hospital', 'HQ_ADDRESS': '322 Aiken St', 'HQ_ADDRESS1': None, 'HQ_CITY': 'Lowell', 'HQ_STATE': 'MA', 'HQ_ZIP_CODE': '1854', 'COUNTY_NAME': 'Middlesex', 'STATE_NAME': 'Massachusetts', 'STATE_FIPS': '25', 'CNTY_FIPS': '017', 'FIPS': '25017', 'NUM_LICENSED_BEDS': None, 'NUM_STAFFED_BEDS': None, 'NUM_ICU_BEDS': 0, 'ADULT_ICU_BEDS': 0, 'PEDI_ICU_BEDS': None, 'BED_UTILIZATION': None, 'Potential_Increase_In_Bed_Capac': 0, 'AVG_VENTILATOR_USAGE': None}
----------------------------------------------------------------------------------------------------------------------------------------------------------------
6617 {'OBJECTID': 6617, 'HOSPITAL_NAME': 'Miami Beach Convention Center Field Hospital (Temporarily Open due to COVID-19)', 'HOSPITAL_TYPE': 'Short Term Acute Care Hospital', 'HQ_ADDRESS': '1901 Convention Center Dr', 'HQ_ADDRESS1': None, 'HQ_CITY': 'Miami Beach', 'HQ_STATE': 'FL', 'HQ_ZIP_CODE': '33139', 'COUNTY_NAME': 'Miami-Dade', 'STATE_NAME': 'Florida', 'STATE_FIPS': '12', 'CNTY_FIPS': '086', 'FIPS': '12086', 'NUM_LICENSED_BEDS': None, 'NUM_STAFFED_BEDS': None, 'NUM_ICU_BEDS': 0, 'ADULT_ICU_BEDS': 0, 'PEDI_ICU_BEDS': None, 'BED_UTILIZATION': None, 'Potential_Increase_In_Bed_Capac': 0, 'AVG_VENTILATOR_USAGE': None}
...

Related

Trying to read table attributes on STATS CANADA website

I am working on integrating various API endpoints provided by Stats Canada. Most of them are easy to understand. One issue I am triaging with this task is to derive table level attributes.
Below resource has come as a great direction to me while implementing some of the available methods.
Python library for Stats Can API
Triage
import stats_can

# Product ID of the Statistics Canada table to inspect.
table_id = '14-10-0027-01'

# Request the cube (table) metadata for that product ID.
metadata = stats_can.sc.get_cube_metadata(table_id)

# Display the result (notebook-style last expression).
metadata
The code above works and returns its output in JSON format. It includes the Geography dimension, with members for the country and each province. However, the Geography attribute does not show its values in the JSON the way they appear in the attached image.
JSON output
Out[2]: [{'responseStatusCode': 0,
'productId': '14100027',
'cansimId': '282-0012',
'cubeTitleEn': 'Employment by class of worker, annual',
'cubeTitleFr': 'Emploi selon la catégorie de travailleur, données annuelles',
'cubeStartDate': '1976-01-01',
'cubeEndDate': '2022-01-01',
'frequencyCode': 12,
'nbSeriesCube': 6270,
'nbDatapointsCube': 229350,
'releaseTime': '2023-01-30T08:30',
'archiveStatusCode': '2',
'archiveStatusEn': 'CURRENT - a cube available to the public and that is current',
'archiveStatusFr': 'ACTIF - un cube qui est disponible au public et qui est toujours mise a jour',
'subjectCode': ['140201', '140299'],
'surveyCode': ['3701'],
'dimension': [{'dimensionPositionId': 1,
'dimensionNameEn': 'Geography',
'dimensionNameFr': 'Géographie',
'hasUom': False,
'member': [{'memberId': 1,
'parentMemberId': None,
'memberNameEn': 'Canada',
'memberNameFr': 'Canada',
'classificationCode': '11124',
'classificationTypeCode': '1',
'geoLevel': 0,
'vintage': 2016,
'terminated': 0,
'memberUomCode': None},
{'memberId': 2,
'parentMemberId': 1,
'memberNameEn': 'Newfoundland and Labrador',
'memberNameFr': 'Terre-Neuve-et-Labrador',
'classificationCode': '10',
'classificationTypeCode': '1',
'geoLevel': 2,
'vintage': 2016,
'terminated': 0,
'memberUomCode': None},
{'memberId': 3,
'parentMemberId': 1,
'memberNameEn': 'Prince Edward Island',
Any direction regarding this is appreciated.

Python yahoo finance data optimization

I've found a code here pretty good to retrieve some data I need (Python yahoo finance error market_cap=int(data.get_quote_yahoo(str)['marketCap']) TypeError: 'int' object is not callable):
import pandas_datareader as web

tickers = ["AAPL", "GOOG", "RY", "HPQ"]
# Download the quote table once; every field below is a column of it, so a
# second request for the same tickers is unnecessary.
quotes = web.get_quote_yahoo(tickers)
# Get market cap (not really necessary for you)
market_cap_data = quotes['marketCap']
# Get the P/E ratio directly
pe_data = quotes['trailingPE']
# print stock and p/e ratio
for stock, pe in zip(tickers, pe_data):
    print(stock, pe)
# More keys that can be used
['language', 'region', 'quoteType', 'triggerable', 'quoteSourceName',
'currency', 'preMarketChange', 'preMarketChangePercent',
'preMarketTime', 'preMarketPrice', 'regularMarketChange',
'regularMarketChangePercent', 'regularMarketTime', 'regularMarketPrice',
'regularMarketDayHigh', 'regularMarketDayRange', 'regularMarketDayLow',
'regularMarketVolume', 'regularMarketPreviousClose', 'bid', 'ask',
'bidSize', 'askSize', 'fullExchangeName', 'financialCurrency',
'regularMarketOpen', 'averageDailyVolume3Month',
'averageDailyVolume10Day', 'fiftyTwoWeekLowChange',
'fiftyTwoWeekLowChangePercent', 'fiftyTwoWeekRange',
'fiftyTwoWeekHighChange', 'fiftyTwoWeekHighChangePercent',
'fiftyTwoWeekLow', 'fiftyTwoWeekHigh', 'dividendDate',
'earningsTimestamp', 'earningsTimestampStart', 'earningsTimestampEnd',
'trailingAnnualDividendRate', 'trailingPE',
'trailingAnnualDividendYield', 'marketState', 'epsTrailingTwelveMonths',
'epsForward', 'sharesOutstanding', 'bookValue', 'fiftyDayAverage',
'fiftyDayAverageChange', 'fiftyDayAverageChangePercent',
'twoHundredDayAverage', 'twoHundredDayAverageChange',
'twoHundredDayAverageChangePercent', 'marketCap', 'forwardPE',
'priceToBook', 'sourceInterval', 'exchangeDataDelayedBy', 'tradeable',
'firstTradeDateMilliseconds', 'priceHint', 'exchange', 'shortName',
'longName', 'messageBoardId', 'exchangeTimezoneName',
'exchangeTimezoneShortName', 'gmtOffSetMilliseconds', 'market',
'esgPopulated', 'price']
I would like to retrieve most of the commented fields at the end of the previous code, but I've done this so far:
import pandas_datareader as web

tickers = ["AAPL", "GOOG", "RY", "SAB.MC"]
# Download the quote table once and pick the wanted columns from it. This
# avoids three identical requests and guarantees that all values come from
# the same snapshot.
quotes = web.get_quote_yahoo(tickers)
market_cap_data = quotes['marketCap']
pe_data = quotes['trailingPE']
fiftytwo_low_data = quotes['fiftyTwoWeekLowChangePercent']
# Print one line per ticker: symbol, market cap, trailing P/E and the
# percentage change from the 52-week low.
for stock, mcap, pe, fiftytwo_low in zip(tickers, market_cap_data, pe_data, fiftytwo_low_data):
    print(stock, mcap, pe, fiftytwo_low)
Obviously I could continue with my brute force, but do you know any way to make the code more elegant to retrieve the whole string of fields with column names?
['language', 'region', 'quoteType', 'triggerable', 'quoteSourceName',
'currency', 'preMarketChange', 'preMarketChangePercent',
'preMarketTime', 'preMarketPrice', 'regularMarketChange',
'regularMarketChangePercent', 'regularMarketTime', 'regularMarketPrice',
'regularMarketDayHigh', 'regularMarketDayRange', 'regularMarketDayLow',
'regularMarketVolume', 'regularMarketPreviousClose', 'bid', 'ask',
'bidSize', 'askSize', 'fullExchangeName', 'financialCurrency',
'regularMarketOpen', 'averageDailyVolume3Month',
'averageDailyVolume10Day', 'fiftyTwoWeekLowChange',
'fiftyTwoWeekLowChangePercent', 'fiftyTwoWeekRange',
'fiftyTwoWeekHighChange', 'fiftyTwoWeekHighChangePercent',
'fiftyTwoWeekLow', 'fiftyTwoWeekHigh', 'dividendDate',
'earningsTimestamp', 'earningsTimestampStart', 'earningsTimestampEnd',
'trailingAnnualDividendRate', 'trailingPE',
'trailingAnnualDividendYield', 'marketState', 'epsTrailingTwelveMonths',
'epsForward', 'sharesOutstanding', 'bookValue', 'fiftyDayAverage',
'fiftyDayAverageChange', 'fiftyDayAverageChangePercent',
'twoHundredDayAverage', 'twoHundredDayAverageChange',
'twoHundredDayAverageChangePercent', 'marketCap', 'forwardPE',
'priceToBook', 'sourceInterval', 'exchangeDataDelayedBy', 'tradeable',
'firstTradeDateMilliseconds', 'priceHint', 'exchange', 'shortName',
'longName', 'messageBoardId', 'exchangeTimezoneName',
'exchangeTimezoneShortName', 'gmtOffSetMilliseconds', 'market',
'esgPopulated', 'price']
thanks
Using a set, you can collect every field name returned for each ticker: start with an empty set and union in the column names of each ticker's quote. The result is the full set of field names that are available for the tickers you want to retrieve.
import pandas_datareader as web
import pandas as pd

tickers = ["AAPL", "GOOG", "RY", "SAB.MC"]

# Accumulate the union of the column names reported for each ticker.
names = set()
for t in tickers:
    market_cap_data = web.get_quote_yahoo(t)
    column_names = market_cap_data.columns.to_list()
    names.update(column_names)

# Display the collected field names (notebook-style last expression).
names
{'ask',
'askSize',
'averageAnalystRating',
'averageDailyVolume10Day',
'averageDailyVolume3Month',
'bid',
'bidSize',
'bookValue',
'cryptoTradeable',
'currency',
'customPriceAlertConfidence',
'displayName',
...
'trailingAnnualDividendYield',
'trailingPE',
'triggerable',
'twoHundredDayAverage',
'twoHundredDayAverageChange',
'twoHundredDayAverageChangePercent',
'typeDisp'}
I know this post is pretty old, but I just came across it now. Check out the 'yfinance' library. There's all kinds of stuff available over there!!
import pandas_datareader as web
import pandas as pd
# Request AAPL data from the 'yahoo' data source for the given start and end dates.
df = web.DataReader('AAPL', data_source='yahoo', start='2011-01-01', end='2021-01-12')
# Preview the beginning of the returned data.
df.head()
# Tour of the data exposed by a yfinance Ticker object. Each attribute below
# is accessed on its own line so that a notebook displays its value.
import yfinance as yf
aapl = yf.Ticker("AAPL")
aapl
# get stock info
aapl.info
# get historical market data
hist = aapl.history(period="max")
# show actions (dividends, splits)
aapl.actions
# show dividends
aapl.dividends
# show splits
aapl.splits
# show financials
aapl.financials
aapl.quarterly_financials
# show major holders
aapl.major_holders
# show institutional holders
aapl.institutional_holders
# show balance sheet
aapl.balance_sheet
aapl.quarterly_balance_sheet
# show cashflow
aapl.cashflow
aapl.quarterly_cashflow
# show earnings
aapl.earnings
aapl.quarterly_earnings
# show sustainability
aapl.sustainability
# show analysts recommendations
aapl.recommendations
# show next event (earnings, etc)
aapl.calendar
# show ISIN code - *experimental*
# ISIN = International Securities Identification Number
aapl.isin
# show options expirations
aapl.options
# get option chain for specific expiration
# ('YYYY-MM-DD' is a placeholder; presumably it should be replaced with one of
# the expiration dates listed by aapl.options)
opt = aapl.option_chain('YYYY-MM-DD')
Result:
{'zip': '95014',
'sector': 'Technology',
'fullTimeEmployees': 164000,
'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. It also sells various related services. In addition, the company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. Further, it provides AppleCare support and cloud services store services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and podcasts. Additionally, the company offers various services, such as Apple Arcade, a game subscription service; Apple Fitness+, a personalized fitness service; Apple Music, which offers users a curated listening experience with on-demand radio stations; Apple News+, a subscription news and magazine service; Apple TV+, which offers exclusive original content; Apple Card, a co-branded credit card; and Apple Pay, a cashless payment service, as well as licenses its intellectual property. The company serves consumers, and small and mid-sized businesses; and the education, enterprise, and government markets. It distributes third-party applications for its products through the App Store. The company also sells its products through its retail and online stores, and direct sales force; and third-party cellular network carriers, wholesalers, retailers, and resellers. Apple Inc. was incorporated in 1977 and is headquartered in Cupertino, California.',
'city': 'Cupertino',
'phone': '408 996 1010',
'state': 'CA',
'country': 'United States',
'companyOfficers': [],
'website': 'https://www.apple.com',
'maxAge': 1,
'address1': 'One Apple Park Way',
'industry': 'Consumer Electronics',
'ebitdaMargins': 0.33105,
'profitMargins': 0.2531,
'grossMargins': 0.43310001,
'operatingCashflow': 122151002112,
'revenueGrowth': 0.081,
'operatingMargins': 0.30289,
'ebitda': 130541002752,
'targetLowPrice': 122,
'recommendationKey': 'buy',
'grossProfits': 170782000000,
'freeCashflow': 90215251968,
'targetMedianPrice': 180,
'currentPrice': 151.29,
'earningsGrowth': 0.048,
'currentRatio': 0.879,
'returnOnAssets': 0.21214001,
'numberOfAnalystOpinions': 41,
'targetMeanPrice': 178.15,
'debtToEquity': 261.446,
'returnOnEquity': 1.75459,
'targetHighPrice': 214,
'totalCash': 48304001024,
'totalDebt': 132480000000,
'totalRevenue': 394328014848,
'totalCashPerShare': 3.036,
'financialCurrency': 'USD',
'revenuePerShare': 24.317,
'quickRatio': 0.709,
'recommendationMean': 1.9,
'exchange': 'NMS',
'shortName': 'Apple Inc.',
'longName': 'Apple Inc.',
'exchangeTimezoneName': 'America/New_York',
'exchangeTimezoneShortName': 'EST',
'isEsgPopulated': False,
'gmtOffSetMilliseconds': '-18000000',
'quoteType': 'EQUITY',
'symbol': 'AAPL',
'messageBoardId': 'finmb_24937',
'market': 'us_market',
'annualHoldingsTurnover': None,
'enterpriseToRevenue': 6.317,
'beta3Year': None,
'enterpriseToEbitda': 19.081,
'52WeekChange': -0.06042725,
'morningStarRiskRating': None,
'forwardEps': 6.82,
'revenueQuarterlyGrowth': None,
'sharesOutstanding': 15908100096,
'fundInceptionDate': None,
'annualReportExpenseRatio': None,
'totalAssets': None,
'bookValue': 3.178,
'sharesShort': 103178670,
'sharesPercentSharesOut': 0.0064999997,
'fundFamily': None,
'lastFiscalYearEnd': 1663977600,
'heldPercentInstitutions': 0.60030997,
'netIncomeToCommon': 99802996736,
'trailingEps': 6.11,
'lastDividendValue': 0.23,
'SandP52WeekChange': -0.15323704,
'priceToBook': 47.60541,
'heldPercentInsiders': 0.00071999995,
'nextFiscalYearEnd': 1727136000,
'yield': None,
'mostRecentQuarter': 1663977600,
'shortRatio': 1.14,
'sharesShortPreviousMonthDate': 1664496000,
'floatShares': 15891414476,
'beta': 1.246644,
'enterpriseValue': 2490915094528,
'priceHint': 2,
'threeYearAverageReturn': None,
'lastSplitDate': 1598832000,
'lastSplitFactor': '4:1',
'legalType': None,
'lastDividendDate': 1667520000,
'morningStarOverallRating': None,
'earningsQuarterlyGrowth': 0.008,
'priceToSalesTrailing12Months': 6.103387,
'dateShortInterest': 1667174400,
'pegRatio': 2.71,
'ytdReturn': None,
'forwardPE': 22.183283,
'lastCapGain': None,
'shortPercentOfFloat': 0.0064999997,
'sharesShortPriorMonth': 103251184,
'impliedSharesOutstanding': 0,
'category': None,
'fiveYearAverageReturn': None,
'previousClose': 150.72,
'regularMarketOpen': 152.305,
'twoHundredDayAverage': 155.0841,
'trailingAnnualDividendYield': 0.005971337,
'payoutRatio': 0.14729999,
'volume24Hr': None,
'regularMarketDayHigh': 152.57,
'navPrice': None,
'averageDailyVolume10Day': 84360340,
'regularMarketPreviousClose': 150.72,
'fiftyDayAverage': 147.0834,
'trailingAnnualDividendRate': 0.9,
'open': 152.305,
'toCurrency': None,
'averageVolume10days': 84360340,
'expireDate': None,
'algorithm': None,
'dividendRate': 0.92,
'exDividendDate': 1667520000,
'circulatingSupply': None,
'startDate': None,
'regularMarketDayLow': 149.97,
'currency': 'USD',
'trailingPE': 24.761045,
'regularMarketVolume': 74496725,
'lastMarket': None,
'maxSupply': None,
'openInterest': None,
'marketCap': 2406736461824,
'volumeAllCurrencies': None,
'strikePrice': None,
'averageVolume': 89929545,
'dayLow': 149.97,
'ask': 150.95,
'askSize': 1000,
'volume': 74496725,
'fiftyTwoWeekHigh': 182.94,
'fromCurrency': None,
'fiveYearAvgDividendYield': 1,
'fiftyTwoWeekLow': 129.04,
'bid': 150.82,
'tradeable': False,
'dividendYield': 0.0061000003,
'bidSize': 1100,
'dayHigh': 152.57,
'coinMarketCapLink': None,
'regularMarketPrice': 151.29,
'preMarketPrice': None,
'logo_url': 'https://logo.clearb
Just pick/choose what you want.

Scraping profiles with Python and the "scrape-linkedin" package

I am trying to use the scrape_linkedin package. I follow the section on the github page on how to set up the package/LinkedIn li_at key (which I paste here for clarity).
Getting LI_AT
Navigate to www.linkedin.com and log in
Open browser developer tools (Ctrl-Shift-I or right click -> inspect element)
Select the appropriate tab for your browser (Application on Chrome, Storage on Firefox)
Click the Cookies dropdown on the left-hand menu, and select the www.linkedin.com option
Find and copy the li_at value
Once I collect the li_at value from my LinkedIn, I run the following code:
from scrape_linkedin import ProfileScraper

# The li_at cookie value copied from the logged-in browser session.
li_at_cookie = 'myVeryLong_li_at_Code_which_has_characters_like_AQEDAQNZwYQAC5_etc'
# Profile page to scrape.
profile_url = 'https://www.linkedin.com/in/justintrudeau/'

# The scraper is used as a context manager so it is cleaned up on exit.
with ProfileScraper(cookie=li_at_cookie) as scraper:
    profile = scraper.scrape(url=profile_url)
print(profile.to_dict())
I have two questions (I am originally an R user).
How can I input a list of profiles:
https://www.linkedin.com/in/justintrudeau/
https://www.linkedin.com/in/barackobama/
https://www.linkedin.com/in/williamhgates/
https://www.linkedin.com/in/wozniaksteve/
and scrape the profiles? (In R I would use the map function from the purrr package to apply the function to each of the LinkedIn profiles).
The output (from the original github page) is returned in a JSON style format. My second question is how I can convert this into a pandas data frame (i.e. it is returned similar to the following).
{'personal_info': {'name': 'Steve Wozniak', 'headline': 'Fellow at
Apple', 'company': None, 'school': None, 'location': 'San Francisco
Bay Area', 'summary': '', 'image': '', 'followers': '', 'email': None,
'phone': None, 'connected': None, 'websites': [],
'current_company_link': 'https://www.linkedin.com/company/sandisk/'},
'experiences': {'jobs': [{'title': 'Chief Scientist', 'company':
'Fusion-io', 'date_range': 'Jul 2014 – Present', 'location': 'Primary
Data', 'description': "I'm looking into future technologies applicable
to servers and storage, and helping this company, which I love, get
noticed and get a lead so that the world can discover the new amazing
technology they have developed. My role is principally a marketing one
at present but that will change over time.", 'li_company_url':
'https://www.linkedin.com/company/sandisk/'}, {'title': 'Fellow',
'company': 'Apple', 'date_range': 'Mar 1976 – Present', 'location': '1
Infinite Loop, Cupertino, CA 94015', 'description': 'Digital Design
engineer.', 'li_company_url': ''}, {'title': 'President & CTO',
'company': 'Wheels of Zeus', 'date_range': '2002 – 2005', 'location':
None, 'description': None, 'li_company_url':
'https://www.linkedin.com/company/wheels-of-zeus/'}, {'title':
'diagnostic programmer', 'company': 'TENET Inc.', 'date_range': '1970
– 1971', 'location': None, 'description': None, 'li_company_url':
''}], 'education': [{'name': 'University of California, Berkeley',
'degree': 'BS', 'grades': None, 'field_of_study': 'EE & CS',
'date_range': '1971 – 1986', 'activities': None}, {'name': 'University
of Colorado Boulder', 'degree': 'Honorary PhD.', 'grades': None,
'field_of_study': 'Electrical and Electronics Engineering',
'date_range': '1968 – 1969', 'activities': None}], 'volunteering':
[]}, 'skills': [], 'accomplishments': {'publications': [],
'certifications': [], 'patents': [], 'courses': [], 'projects': [],
'honors': [], 'test_scores': [], 'languages': [], 'organizations':
[]}, 'interests': ['Western Digital', 'University of Colorado
Boulder', 'Western Digital Data Center Solutions', 'NEW Homebrew
Computer Club', 'Wheels of Zeus', 'SanDisk®']}
Firstly, you can write a custom function that scrapes a single profile and use Python's built-in map function to apply it to each profile link.
Secondly, to create a pandas DataFrame from a dictionary, you can simply pass the dictionary to pd.DataFrame.
Thus, to create a DataFrame df from a dictionary dict, you can do this:
# `dict` is a placeholder for your own dictionary variable; prefer a different
# name in real code, because `dict` is also the name of a Python built-in.
df = pd.DataFrame(dict)

Python Generators and how to iterate over correctly to drop records based on a key within the dictionary being present in a a separate list

I'm new to the concept of generators and I'm struggling with how to apply my changes to the records within the generator object returned from the RISparser module.
I understand that a generator only reads a record at a time and doesn't actually store the data in memory but I'm having a tough time iterating over it effectively and applying my changes.
My changes will involve dropping records that have not got ['doi'] values that are contained within a list of DOIs [doi_match].
# DOIs of the records to keep; records whose 'doi' is not listed here are to be dropped.
doi_match = ['10.1002/14651858.CD008259.pub2','10.1002/14651858.CD011552','10.1002/14651858.CD011990']
The generator object returned from RISparser contains the following information; these are just the first 2 records of a few hundred. I want to iterate over it and compare the 'doi' key of each record with the list of DOIs.
{'type_of_reference': 'JOUR', 'title': "The CoRe Outcomes in WomeN's health (CROWN) initiative: Journal editors invite researchers to develop core outcomes in women's health", 'secondary_title': 'Neurourology and Urodynamics', 'alternate_title1': 'Neurourol. Urodyn.', 'volume': '33', 'number': '8', 'start_page': '1176', 'end_page': '1177', 'year': '2014', 'doi': '10.1002/nau.22674', 'issn': '07332467 (ISSN)', 'authors': ['Khan, K.'], 'keywords': ['Bias (epidemiology)', 'Clinical trials', 'Consensus', 'Endpoint determination/standards', 'Evidence-based medicine', 'Guidelines', 'Research design/standards', 'Systematic reviews', 'Treatment outcome', 'consensus', 'editor', 'female', 'human', 'medical literature', 'Note', 'outcomes research', 'peer review', 'randomized controlled trial (topic)', 'systematic review (topic)', "women's health", 'outcome assessment', 'personnel', 'publication', 'Female', 'Humans', 'Outcome Assessment (Health Care)', 'Periodicals as Topic', 'Research Personnel', "Women's Health"], 'publisher': 'John Wiley and Sons Inc.', 'notes': ['Export Date: 14 July 2020', 'CODEN: NEURE'], 'type_of_work': 'Note', 'name_of_database': 'Scopus', 'custom2': '25270392', 'language': 'English', 'url': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-84908368202&doi=10.1002%2fnau.22674&partnerID=40&md5=b220702e005430b637ef9d80a94dadc4'}
{'type_of_reference': 'JOUR', 'title': "The CROWN initiative: Journal editors invite researchers to develop core outcomes in women's health", 'secondary_title': 'Gynecologic Oncology', 'alternate_title1': 'Gynecol. Oncol.', 'volume': '134', 'number': '3', 'start_page': '443', 'end_page': '444', 'year': '2014', 'doi': '10.1016/j.ygyno.2014.05.005', 'issn': '00908258 (ISSN)', 'authors': ['Karlan, B.Y.'], 'author_address': 'Gynecologic Oncology and Gynecologic Oncology Reports, India', 'keywords': ['clinical trial (topic)', 'decision making', 'Editorial', 'evidence based practice', 'female infertility', 'health care personnel', 'human', 'outcome assessment', 'outcomes research', 'peer review', 'practice guideline', 'premature labor', 'priority journal', 'publication', 'systematic review (topic)', "women's health", 'editorial', 'female', 'outcome assessment', 'personnel', 'publication', 'Female', 'Humans', 'Outcome Assessment (Health Care)', 'Periodicals as Topic', 'Research Personnel', "Women's Health"], 'publisher': 'Academic Press Inc.', 'notes': ['Export Date: 14 July 2020', 'CODEN: GYNOA', 'Correspondence Address: Karlan, B.Y.; Gynecologic Oncology and Gynecologic Oncology ReportsIndia'], 'type_of_work': 'Editorial', 'name_of_database': 'Scopus', 'custom2': '25199578', 'language': 'English', 'url': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-84908351159&doi=10.1016%2fj.ygyno.2014.05.005&partnerID=40&md5=ab5a4d26d52c12d081e38364b0c79678'}
I tried iterating over the generator and applying the changes. But the records that have matches are not being placed in the match list.
# Keep only the records whose DOI is in doi_match.
#
# A generator can be iterated only once: if ris_records has already been
# consumed (for example by printing or looping over it earlier), this pass
# sees no records and `match` stays empty. Re-create the generator first.
wanted_dois = set(doi_match)  # set gives O(1) membership tests
# entry.get('doi') returns None for records without a DOI, so those records
# are skipped instead of raising KeyError. Records that do not match are
# simply not collected; there is nothing to delete from a generator.
match = [entry for entry in ris_records if entry.get('doi') in wanted_dois]
any advice on how to iterate over a generator correctly, thanks.

flatten nested dictionary with dictionary embedded in lists (functional python)

This question has been asked many times - but only once with this special case. I could partially find an answer here, but it flattens everything down to individual objects.
I have this dictionary:
{'address': {'address_line_1': 'Floor Dekk House',
'address_line_2': 'Zippora Street Providence Industrial Estate',
'country': 'Seychelles',
'locality': 'Mahe',
'premises': '1st'},
'address_snippet': '1st, Floor Dekk House, Zippora Street Providence Industrial Estate, Mahe, Seychelles',
'appointment_count': 1,
'description': 'Total number of appointments 1',
'description_identifiers': ['appointment-count'],
'kind': 'searchresults#officer',
'links': {'self': '/officers/z7s5QUnhlYpAT8GvqvJ5snKmtHE/appointments'},
'matches': {'snippet': [], 'title': [1, 8, 10, 11]},
'snippet': '',
'title': 'ASTROCOM AG '}
As you can see, "description_identifiers", "matches.snippet" and "matches.title" have a list as their value. I'd like to edit my code below to flatten my dictionary so that the JSON becomes a flat `{key: value, key: value, key: value}` mapping - but if a value is a list of atomic objects (not a list of lists or a list of dictionaries), it is kept as a list.
The objective is to be able to then upload this JSON to PostgreSQL.
Here's some code i found online:
def flatten_json(dictionary, keep_atomic_lists=False):
    """Flatten a nested JSON-like dict into a single-level dict.

    Nested dict keys are joined to their parent key with '_'. List items are
    unpacked the same way, using the item's index as the key suffix
    (``{'a': [1, 2]}`` becomes ``{'a_0': 1, 'a_1': 2}``). Empty dicts and
    unpacked empty lists contribute no keys to the result.

    Args:
        dictionary: The dict to flatten. Keys at every level must be strings.
        keep_atomic_lists: When True, a list that contains no dict or list
            items (including an empty list) is kept as a single value instead
            of being unpacked. Defaults to False, which unpacks every list.

    Returns:
        A new dict with no dict values, and with no list values other than
        the atomic lists preserved by ``keep_atomic_lists``.
    """
    # Imported here so the function works without module-level imports.
    from itertools import chain, starmap

    def needs_unpacking(value):
        """Return True if *value* is a container that must still be unpacked."""
        if isinstance(value, dict):
            return True
        if isinstance(value, list):
            if keep_atomic_lists:
                # Only lists that contain further containers are unpacked.
                return any(isinstance(item, (dict, list)) for item in value)
            return True
        return False

    def unpack(parent_key, parent_value):
        """Unpack one level of nesting, yielding (key, value) pairs."""
        if not needs_unpacking(parent_value):
            yield parent_key, parent_value
        elif isinstance(parent_value, dict):
            for key, value in parent_value.items():
                yield parent_key + '_' + key, value
        else:
            for index, value in enumerate(parent_value):
                yield parent_key + '_' + str(index), value

    # Each pass removes one level of nesting; repeat until nothing is left
    # to unpack.
    while True:
        dictionary = dict(chain.from_iterable(starmap(unpack, dictionary.items())))
        if not any(needs_unpacking(value) for value in dictionary.values()):
            break
    return dictionary
Desired output:
And to test, this dict:
Should not be (which is what I get now):
{'address_address_line_1': 'Floor Dekk House',
'address_address_line_2': 'Zippora Street Providence Industrial Estate',
'address_country': 'Seychelles',
'address_locality': 'Mahe',
'address_premises': '1st',
'address_snippet': '1st, Floor Dekk House, Zippora Street Providence Industrial Estate, Mahe, Seychelles',
'appointment_count': 1,
'description': 'Total number of appointments 1',
'description_identifiers_0': 'appointment-count',
'kind': 'searchresults#officer',
'links_self': '/officers/z7s5QUnhlYpAT8GvqvJ5snKmtHE/appointments',
'matches_title_0': 1,
'matches_title_1': 8,
'matches_title_2': 10,
'matches_title_3': 11,
'snippet': '',
'title': 'ASTROCOM AG '}
But rather
{'address_address_line_1': 'Floor Dekk House',
'address_address_line_2': 'Zippora Street Providence Industrial Estate',
'address_country': 'Seychelles',
'address_locality': 'Mahe',
'address_premises': '1st',
'address_snippet': '1st, Floor Dekk House, Zippora Street Providence Industrial Estate, Mahe, Seychelles',
'appointment_count': 1,
'description': 'Total number of appointments 1',
'description_identifiers_0': 'appointment-count',
'kind': 'searchresults#officer',
'links_self': '/officers/z7s5QUnhlYpAT8GvqvJ5snKmtHE/appointments',
'matches_title': [1, 8, 10, 11],
'snippet': '',
'title': 'ASTROCOM AG '}
You are almost there; you just need one more check in the condition:
def flatten(dict_, prefix):
    """Yield flattened (key, value) pairs for a nested dict.

    Nested dict keys are joined to their parent key with '_', starting from
    *prefix*. A list with exactly one item is unwrapped: a dict item is
    flattened under the list's key, and any other item is yielded under the
    key with a '_0' suffix. Every other value, including lists of any other
    length, is yielded unchanged.
    """
    for name, item in dict_.items():
        path = prefix + name
        if isinstance(item, list) and len(item) == 1:
            only = item[0]
            if not isinstance(only, dict):
                yield path + "_0", only
                continue
            # A single wrapped dict is flattened like a plain nested dict.
            item = only
        if isinstance(item, dict):
            yield from flatten(item, path + "_")
        else:
            yield path, item
Usage:
import json

# Sample record: nested dicts plus lists of atomic values.
dict_ = {
    'address': {
        'address_line_1': 'Floor Dekk House',
        'address_line_2': 'Zippora Street Providence Industrial Estate',
        'country': 'Seychelles',
        'locality': 'Mahe',
        'premises': '1st',
    },
    'address_snippet': '1st, Floor Dekk House, Zippora Street Providence Industrial Estate, Mahe, Seychelles',
    'appointment_count': 1,
    'description': 'Total number of appointments 1',
    'description_identifiers': ['appointment-count'],
    'kind': 'searchresults#officer',
    'links': {'self': '/officers/z7s5QUnhlYpAT8GvqvJ5snKmtHE/appointments'},
    'matches': {'snippet': [], 'title': [1, 8, 10, 11]},
    'snippet': '',
    'title': 'ASTROCOM AG ',
}

# Collect the generated (key, value) pairs into a dict and pretty-print it.
flattened = dict(flatten(dict_, ""))
print(json.dumps(flattened, indent=4))
Output:
{
"address_address_line_1": "Floor Dekk House",
"address_address_line_2": "Zippora Street Providence Industrial Estate",
"address_country": "Seychelles",
"address_locality": "Mahe",
"address_premises": "1st",
"address_snippet": "1st, Floor Dekk House, Zippora Street Providence Industrial Estate, Mahe, Seychelles",
"appointment_count": 1,
"description": "Total number of appointments 1",
"description_identifiers_0": "appointment-count",
"kind": "searchresults#officer",
"links_self": "/officers/z7s5QUnhlYpAT8GvqvJ5snKmtHE/appointments",
"matches_snippet": [],
"matches_title": [
1,
8,
10,
11
],
"snippet": "",
"title": "ASTROCOM AG "
}

Categories

Resources