I am working on a project where I need to remove the leftmost and rightmost characters of a data result. The data comes from a Craigslist scrape, and the neighborhood results come back as '(####)', but what I need is ####. I am using pandas and trying to use lstrip and rstrip. When I try it in the Python shell it works, but when I use it on my data it does not.
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.rstrip(')')
For some reason the rstrip works and removes the ')', but the lstrip does not.
The full code is:
from bs4 import BeautifulSoup
import json
from requests import get
import numpy as np
import pandas as pd
import csv
print('hello world')
#get the initial page for the listings, to get the total count
response = get('https://washingtondc.craigslist.org/search/hhh?query=rent&availabilityMode=0&sale_date=all+dates')
html_result = BeautifulSoup(response.text, 'html.parser')
results = html_result.find('div', class_='search-legend')
total = int(results.find('span',class_='totalcount').text)
pages = np.arange(0,total+1,120)
neighborhood = []
bedroom_count =[]
sqft = []
price = []
link = []
for page in pages:
    #print(page)
    response = get('https://washingtondc.craigslist.org/search/hhh?s='+str(page)+'query=rent&availabilityMode=0&sale_date=all+dates')
    html_result = BeautifulSoup(response.text, 'html.parser')
    posts = html_result.find_all('li', class_='result-row')
    for post in posts:
        if post.find('span',class_='result-hood') is not None:
            post_url = post.find('a',class_='result-title hdrlnk')
            post_link = post_url['href']
            link.append(post_link)
            post_neighborhood = post.find('span',class_='result-hood').text
            post_price = int(post.find('span',class_='result-price').text.strip().replace('$',''))
            neighborhood.append(post_neighborhood)
            price.append(post_price)
            if post.find('span',class_='housing') is not None:
                if 'ft2' in post.find('span',class_='housing').text.split()[0]:
                    post_bedroom = np.nan
                    post_footage = post.find('span',class_='housing').text.split()[0][:-3]
                    bedroom_count.append(post_bedroom)
                    sqft.append(post_footage)
                elif len(post.find('span',class_='housing').text.split())>2:
                    post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
                    post_footage = post.find('span',class_='housing').text.split()[2][:-3]
                    bedroom_count.append(post_bedroom)
                    sqft.append(post_footage)
                elif len(post.find('span',class_='housing').text.split())==2:
                    post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
                    post_footage = np.nan
                    bedroom_count.append(post_bedroom)
                    sqft.append(post_footage)
            else:
                post_bedroom = np.nan
                post_footage = np.nan
                bedroom_count.append(post_bedroom)
                sqft.append(post_footage)
#create results data frame
post_results = pd.DataFrame({'neighborhood':neighborhood,'footage':sqft,'bedroom':bedroom_count,'price':price,'link':link})
#clean up results
post_results.drop_duplicates(subset='link')
post_results['footage'] = post_results['footage'].replace(0,np.nan)
post_results['bedroom'] = post_results['bedroom'].replace(0,np.nan)
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.rstrip(')')
post_results = post_results.dropna(subset=['footage','bedroom'],how='all')
post_results.to_csv("rent_clean.csv",index=False)
print(len(post_results.index))
This problem will happen when you have whitespace at the front.
For example:
s=pd.Series([' (xxxx)','(yyyy) '])
s.str.strip('(|)')
0 (xxxx
1 yyyy)
dtype: object
What we can do is strip twice:
s.str.strip().str.strip('(|)')
0 xxxx
1 yyyy
dtype: object
From my understanding of your question, you are removing characters from a string. You don't need pandas for this; strings can be sliced, so you can remove the first and last character like this:
new_word = old_word[1:-1]
This should work for you. Good luck.
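If you want to apply the same slicing idea to the whole pandas column rather than one string at a time, here is a minimal sketch using the .str accessor on a toy Series that mirrors the scraped values (the neighborhood names are made up):
import pandas as pd
# Toy column mirroring the scraped '(####)' values, with stray whitespace.
s = pd.Series([' (Dupont Circle)', '(Arlington) '])
# strip() drops the surrounding whitespace, then the slice removes the
# leading '(' and trailing ')'.
print(s.str.strip().str[1:-1])
# 0    Dupont Circle
# 1        Arlington
# dtype: object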
I have tried many ways to concatenate a list of DataFrames together, but I keep getting the error message "ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part."
At the moment the list only contains two elements, both of them DataFrames. They do have different columns in places, but I didn't think this would be an issue. At the moment I have:
df_year_stats = pd.concat(yearStats, axis = 0, ignore_index = True).reset_index(drop=True)
I don't think the DataFrames have any lists in them, but that is the only plausible cause I have thought of so far. If so, how would I go about checking for them?
Any help would be greatly appreciated, thank you.
edit code:
import pandas as pd
from pandas.api.types import is_string_dtype
import requests
from bs4 import BeautifulSoup as bs
course_df = pd.read_csv("dg_course_table.csv")
soup = bs(requests.get('https://www.pgatour.com/stats/categories.ROTT_INQ.html').text, 'html.parser')
tabs = soup.find('div',attrs={'class','tabbable-head clearfix hidden-small'})
subStats = tabs.find_all('a')
# creating lists of tab and link, and removing the first and last
tab_links = []
tab_names = []
for subStat in subStats:
    tab_names.append(subStat.text)
    tab_links.append(subStat.get('href'))
tab_names = tab_names[1:-2] #potentially remove other areas here- points/rankings and streaks
tab_links = tab_links[1:-2]
# creating empty lists
stat_links = []
all_stat_names = []
# looping through each tab and extracting all of the stats URL's, along with the corresponding stat name.
for link in tab_links:
    page2 = 'https://www.pgatour.com' + str(link)
    req2 = requests.get(page2)
    soup2 = bs(req2.text, 'html.parser')
    # find correct part of html code
    stat = soup2.find('section',attrs={'class','module-statistics-off-the-tee clearfix'})
    specificStats = stat.find_all('a')
    for stat in specificStats:
        stat_links.append(stat.get('href'))
        all_stat_names.append(stat.text)
s_asl = pd.Series(stat_links, index = all_stat_names )
s_asl = s_asl.drop(labels='show more')
s_asl = s_asl.str[:-4]
tourn_links = pd.Series([],dtype=('str'))
df_all_stats = []
req4 = requests.get('https://www.pgatour.com/content/pgatour/stats/stat.120.y2014.html')
soup4 = bs(req4.text, 'html.parser')
stat = soup4.find('select',attrs={'aria-label':'Available Tournaments'})
htm = stat.find_all('option')
for h in htm: #finding all tournament codes for the given year
    z = pd.Series([h.get('value')],index=[h.text])
    tourn_links = tourn_links.append(z)
yearStats = []
count = 0
for tournament in tourn_links[0:2]: # create stat tables for two different golf tournaments
    print(tournament)
    df1 = []
    df_labels = []
    for r in range(0,len(s_asl)): #loop through all stat links adding the corresponding stat to that tournament's df
        try:
            link = 'https://www.pgatour.com'+s_asl[r]+'y2014.eon.'+tournament+'.html'
            web = pd.read_html(requests.get(link).text)
            table = web[1].set_index('PLAYER NAME')
            df1.append(table)
            df_labels.append(s_asl.index[r])
        except:
            print("empty table")
    try:
        df_tourn_stats = pd.concat(df1,keys=df_labels,axis=1)
        df_tourn_stats.reset_index(level=0, inplace=True)
        df_tourn_stats.insert(1,'Tournament Name',tourn_links.index[count])
        df_tourn_stats.to_csv(str(count) + ".csv")
        df_tourn_stats = df_tourn_stats.loc[:,~df_tourn_stats.columns.duplicated()].copy()
        yearStats.append(df_tourn_stats)
    except:
        print("NO DATA")
    count = count + 1
#combine the stats of the two different tournaments into one dataframe
df_year_stats = pd.concat(yearStats, axis = 0, ignore_index = True).reset_index(drop=True)
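For reference, one way to check whether any of the frames in yearStats hold list- or array-valued cells (the helper below is only an illustration, not part of the original code):
import numpy as np
import pandas as pd
def report_sequence_cells(df):
    # Columns containing at least one list/tuple/ndarray cell, a common
    # cause of "setting an array element with a sequence".
    mask = df.applymap(lambda x: isinstance(x, (list, tuple, np.ndarray)))
    return df.columns[mask.any()].tolist()
# for i, frame in enumerate(yearStats):
#     print(i, report_sequence_cells(frame))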
I'll list the two bugs I know of as of now; if you have any recommendations for refactoring my code, please let me know as well.
yfinance is not appending the dividendYield to my dict. I did make sure that there is an actual dividend yield for those symbols.
TypeError: can only concatenate str (not "Tag") to str, which I assume has something to do with how it parses the XML: it ran into a Tag, so I am not able to create the expander. I thought I could solve it with this if statement, but instead I just don't get any expander at all.
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        if heading == str:
            st.markdown("* " + heading)
        else:
            pass
Full code for main.py:
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup
st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")
def extract_rss(rss_link):
    # Parses xml, and extracts the headings.
    headings = []
    response1 = requests.get(
        "http://feeds.marketwatch.com/marketwatch/marketpulse/")
    response2 = requests.get(rss_link)
    parse1 = BeautifulSoup(response1.content, features="xml")
    parse2 = BeautifulSoup(response2.content, features="xml")
    headings1 = parse1.findAll('title')
    headings2 = parse2.findAll('title')
    headings = headings1 + headings2
    return headings
def stock_info(headings):
    # Get the entities from each heading, link them with the nasdaq data if
    # possible, and extract market data with yfinance.
    stock_dict = {
        'Org': [],
        'Symbol': [],
        'currentPrice': [],
        'dayHigh': [],
        'dayLow': [],
        'forwardPE': [],
        'dividendYield': []
    }
    stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
    for title in headings:
        doc = nlp(title.text)
        for ent in doc.ents:
            try:
                if stocks_df['Name'].str.contains(ent.text).sum():
                    symbol = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Symbol'].values[0]
                    org_name = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Name'].values[0]
                    # Receive info from yfinance
                    stock_info = yf.Ticker(symbol).info
                    print(symbol)
                    stock_dict['Org'].append(org_name)
                    stock_dict['Symbol'].append(symbol)
                    stock_dict['currentPrice'].append(
                        stock_info['currentPrice'])
                    stock_dict['dayHigh'].append(stock_info['dayHigh'])
                    stock_dict['dayLow'].append(stock_info['dayLow'])
                    stock_dict['forwardPE'].append(stock_info['forwardPE'])
                    stock_dict['dividendYield'].append(
                        stock_info['dividendYield'])
                else:
                    # If name can't be found pass.
                    pass
            except:
                # Don't raise an error.
                pass
    output_df = pd.DataFrame.from_dict(stock_dict, orient='index')
    output_df = output_df.transpose()
    return output_df
# Add input field
user_input = st.text_input(
    "Add rss link here", "https://www.investing.com/rss/news.rss")
# Get financial headlines
fin_headings = extract_rss(user_input)
print(fin_headings)
# Output financial info
output_df = stock_info(fin_headings)
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        if heading == str:
            st.markdown("* " + heading)
        else:
            pass
There is an issue in the logic of your stock_info function because of which the same symbol gets different values, and when you clean up duplicates based on the symbol, only the row with its first occurrence is retained.
The below code will solve both of your issues.
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup
st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")
def extract_rss(rss_link):
    # Parses xml, and extracts the headings.
    headings = []
    response1 = requests.get(
        "http://feeds.marketwatch.com/marketwatch/marketpulse/")
    response2 = requests.get(rss_link)
    parse1 = BeautifulSoup(response1.content, features="xml")
    parse2 = BeautifulSoup(response2.content, features="xml")
    headings1 = parse1.findAll('title')
    headings2 = parse2.findAll('title')
    headings = headings1 + headings2
    return headings
def stock_info(headings):
    stock_info_list = []
    stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
    for title in headings:
        doc = nlp(title.text)
        for ent in doc.ents:
            try:
                if stocks_df['Name'].str.contains(ent.text).sum():
                    symbol = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Symbol'].values[0]
                    org_name = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Name'].values[0]
                    # Receive info from yfinance
                    print(symbol)
                    stock_info = yf.Ticker(symbol).info
                    stock_info['Org'] = org_name
                    stock_info['Symbol'] = symbol
                    stock_info_list.append(stock_info)
                else:
                    # If name can't be found pass.
                    pass
            except:
                # Don't raise an error.
                pass
    output_df = pd.DataFrame(stock_info_list)
    return output_df
# Add input field
user_input = st.text_input(
    "Add rss link here", "https://www.investing.com/rss/news.rss")
# Get financial headlines
fin_headings = extract_rss(user_input)
output_df = stock_info(fin_headings)
output_df = output_df[['Org','Symbol','currentPrice','dayHigh','dayLow','forwardPE','dividendYield']]
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        heading = heading.text
        if type(heading) == str:
            st.markdown("* " + heading)
        else:
            pass
For issue #2, the patch code that you posted has a small mistake. Rather than checking if heading == str, which does something completely different from what you intended and will always be False, you want to check isinstance(heading, str). That way you get True if heading is a string and False if not. However, even then it would not be a fix, because heading is not a string. Instead you want to call get_text on heading to get the actual text part of the parsed object:
heading.get_text()
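A minimal sketch of how the expander loop could look with that change, reusing st and fin_headings from the script above:
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        text = heading.get_text()  # extract the text from the parsed Tag
        if isinstance(text, str) and text:
            st.markdown("* " + text)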
More information would be needed to solve issue #1. What does stock_dict look like before you create the Dataframe out of it? Specifically, what values are in stock_dict['dividendYield']? Can you print it and add it to your question?
Also, about the refactoring part: an
else:
    pass
block does nothing at all and should be deleted. (When the if condition is false, nothing happens anyway.)
I am scraping a URL (example: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-4.html) and the number on the end of the URL is the page number. I am trying to scrape multiple pages, so I used the following code to loop through the multiple pages:
for page in range(4, 7): #Range designates the page numbers for the URL
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
    print(page)
When I run the code in my script and print the page, it returns 4, 5 and 6, meaning that it should be working. However, whenever I run the full code, it only gives me the results for the 6th page.
What I think may be happening is that the code settles on the last number and formats only that into the URL, when it should be formatting each number into the URL instead.
I have tried looking at other people with similar issues but haven't been able to find a solution. I believe this may be a code formatting error but I am not exactly sure. Any advice is greatly appreciated. Thank you.
Here is the remainder of my code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import os
import pandas as pd
import openpyxl
# define 1-1-2020 as a datetime object
after_date = datetime(2021, 1, 1)
with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    for page in range(4, 7): #Range designates the page numbers for the URL
        r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
        print(page)
        soup = bs(r.content, 'lxml')
        # select all tr elements (minus the first one, which is the header)
        table_elements = soup.select('tr')[1:]
        address_links = []
        for element in table_elements:
            children = element.contents # get children of table element
            url = children[1].a['href']
            last_out_str = children[8].text
            if last_out_str != "": # check to make sure the date field isn't empty
                last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z") # load date into datetime object for comparison
                if last_out > after_date: # if check to see if the date is after last_out
                    address_links.append(url + '-full') #add adddress_links to the list, -full makes the link show all data
                    print(address_links)
    for url in address_links: #loop through the urls in address_links list
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        ad2 = (soup.title.string) #grab the web title which is used for the filename
        ad2 = ad2.replace('Dogecoin', '')
        ad2 = ad2.replace('Address', '')
        ad2 = ad2.replace('-', '')
        filename = ad2.replace(' ', '')
        sections = soup.find_all(class_='table-striped')
        for section in sections: #This contains the data which is imported into the 'gf' dataframe or the 'info' xlsx sheet
            oldprofit = section.find_all('td')[11].text #Get the profit
            removetext = oldprofit.replace('USD', '')
            removetext = removetext.replace(' ', '')
            removetext = removetext.replace(',', '')
            profit = float(removetext)
            balance = section.find_all('td')[0].text #Get the wallet balance
            amount_recieved = section.find_all('td')[3].text #Get amount recieved
            ins = amount_recieved[amount_recieved.find('(') + 1:amount_recieved.find(')')] #Filter out text from amount recieved
            ins = ins.replace('ins', '')
            ins = ins.replace(' ', '')
            ins = float(ins)
            first_recieved = section.find_all('td')[4].text #Get the data of the first incoming transaction
            fr = first_recieved.replace('first', '')
            fr = fr.replace(':', '')
            fr = fr.replace(' ', '')
            last_recieved = section.find_all('td')[5].text #Get the date of the last incoming transaction
            lr = last_recieved.replace('last', '')
            lr = lr.replace(':', '')
            lr = lr.replace(' ', '')
            amount_sent = section.find_all('td')[7].text #Get the amount sent
            outs = amount_sent[amount_sent.find('(') + 1:amount_sent.find(')')] #Filter out the text
            outs = outs.replace('outs', '')
            outs = outs.replace(' ', '')
            outs = float(outs)
            first_sent = section.find_all('td')[8].text #Get the first outgoing transaction date
            fs = first_sent.replace('first', '') #clean up first outgoing transaction date
            fs = fs.replace(':', '')
            fs = fs.replace(' ', '')
            last_sent = section.find_all('td')[9].text #Get the last outgoing transaction date
            ls = last_sent.replace('last', '') #Clean up last outgoing transaction date
            ls = ls.replace(':', '')
            ls = ls.replace(' ', '')
            dbalance = section.find_all('td')[0].select('b') #get the balance of doge
            dusd = section.find_all('td')[0].select('span')[1] #get balance of USD
            for data in dbalance: #used to clean the text up
                balance = data.text
            for data1 in dusd: #used to clean the text up
                usd = data1.text
            # Compare profit to goal, if profit doesn't meet the goal, the URL is not scraped
            goal = float(30000)
            if profit < goal:
                continue
            #Select wallets with under 2000 transactions
            trans = float(ins + outs) #adds the amount of incoming and outgoing transactions
            trans_limit = float(2000)
            if trans > trans_limit:
                continue
            # Create Info Dataframe using the data from above
            info = {
                'Balance': [balance],
                'USD Value': [usd],
                'Wallet Profit': [profit],
                'Amount Recieved': [amount_recieved],
                'First Recieved': [fr],
                'Last Recieved': [lr],
                'Amount Sent': [amount_sent],
                'First Sent': [fs],
                'Last Sent': [ls],
            }
            gf = pd.DataFrame(info)
            a = 'a'
            if a:
                df = \
                    pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text, attrs={"id": "table_maina"},
                                 index_col=None, header=[0])[0] #uses pandas to read the dataframe and save it
                directory = '/Users/chris/Desktop/Files' #directory for the file to go to
                file = f'{filename}.xlsx'
                writer = pd.ExcelWriter(os.path.join(directory, file), engine='xlsxwriter')
                with pd.ExcelWriter(writer) as writer:
                    df.to_excel(writer, sheet_name='transactions')
                    gf.to_excel(writer, sheet_name='info')
Check your indentation. In your question the loops are on the same level, so the loop that makes the requests iterates over all the pages, but the results are never processed until the iteration is done. That is why it only works for the last page.
Move the loops that handle the response and extract the elements into your first loop:
...
for page in range(4, 7): #Range designates the page numbers for the URL
    r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
    print(page)
    soup = bs(r.content, 'lxml')
    table_elements = soup.select('tr')[1:]
    address_links = []
    for element in table_elements:
        ...
    for url in address_links:
        ...
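A tiny standalone illustration of why the original layout only produced results for the last page: when the processing sits after the loop at the same indentation level, it only ever sees the final value of r.
# r is rebound on every pass of the loop.
for page in range(4, 7):
    r = f"<response for page {page}>"  # stand-in for s.get(...)
# This line is OUTSIDE the loop, so it runs once and only sees page 6.
print(r)  # <response for page 6>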
I am trying to web scrape stock data using a for loop over a list of five stocks. The problem is that only the first value is returned, five times. I have tried appending to a list, but it still doesn't work, although clearly I am not appending correctly. On the website I want to get the data for Operating Cash, which comes in a form like 14B or 1B, which is why I remove the B and multiply the number to get a raw value. Here is my code:
import requests
import yfinance as yf
import pandas as pd
from bs4 import BeautifulSoup
headers = {'User Agent':'Mozilla/5.0'}
stocks = ['AMC','AMD','PFE','AAPL', 'NVDA']
finished_list = []
for stock in stocks:
    url = f'https://www.marketwatch.com/investing/stock/{stock}/financials/cash-flow'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    operating_cash = soup.findAll('div', class_ = "cell__content")[134].text
    finished_list.append(operating_cash)
    if 'B' in operating_cash:
        cash1 = operating_cash.replace('B','')
        if '(' in cash1:
            cash2 = cash1.replace('(','-')
            if ')' in cash2:
                cash3 = cash2.replace(')','')
                cash3 = float(cash3)
                print(cash3*1000000000)
        else:
            cash1 = float(cash1)
            print(cash1 * 1000000000)
The current output is -1060000000.0 five times in a row which is the correct value for operating cash for AMC but not for the other four. Thanks in advance to anyone who can help me out.
You don't need to use if conditions for str.replace(). Instead, do all your replacements in one line like so:
for stock in stocks:
    url = f'https://www.marketwatch.com/investing/stock/{stock}/financials/cash-flow'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')
    operating_cash = soup.findAll('div', class_ = "cell__content")[134].text
    finished_list.append(operating_cash)
    cash = float(operating_cash.replace('B','').replace('(','-').replace(')',''))
    print(cash*1000000000)
-1060000000.0
1070000000.0000001
14400000000.0
80670000000.0
5820000000.0
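To see what the chained replacements do on a single value, here is the AMC figure from the question in isolation, assuming the raw cell text was '(1.06B)' (parentheses marking a negative value):
raw = '(1.06B)'
# Drop the 'B', turn the opening parenthesis into a minus sign, drop the
# closing parenthesis, then scale from billions to a raw value.
value = float(raw.replace('B', '').replace('(', '-').replace(')', ''))
print(value * 1000000000)  # -1060000000.0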
I am trying to build a DataFrame; in this attempt I grab the data and the column names from dicts. (I tried doing this with pd.Series but kept running into issues there as well.)
import requests
import pandas as pd
from bs4 import BeautifulSoup
# get link and parse
page = requests.get('https://www.finviz.com/screener.ashx?v=111&ft=4')
soup = BeautifulSoup(page.text, 'html.parser')
# return 'Title's for each filter
# to be used as columns in dataframe
titles = soup.find_all('span', attrs={'class': 'screener-combo-title'})
title_list = []
for t in titles:
    t = t.stripped_strings
    t = ' '.join(t)
    title_list.append(t)
title_list = {k: v for k, v in enumerate(title_list)}
# finding filters-cells tag id's
# to be used to build url
filters = soup.find_all('select', attrs={'data-filter': True})
filter_list = []
for f in filters:
    filter_list.append(f.get('data-filter'))
# finding selectable values per cell
# to be used as data in dataframe
final_list = []
for f in filters:
    options = f.find_all('option', attrs={'value': True})
    option_list = [] # list needs to stay inside
    for option in options:
        if option['value'] != "":
            option_list.append(option['value'])
    final_list.append(option_list)
final_list = {k: v for k, v in enumerate(final_list)}
df = pd.DataFrame([final_list], columns=[title_list])
print(df)
This results in TypeError: unhashable type: 'dict'. An example of the output I want would look like this (the first column is NOT the index):
Exchange    Index     ...
amex        s&p500    ...
nasd        djia
nyse
Here is an attempt to build a dict where each key corresponds to a filter title and each value corresponds to the list of possible choices. Does it suit your needs?
import requests
import pandas as pd
from bs4 import BeautifulSoup
# get link and parse
page = requests.get('https://www.finviz.com/screener.ashx?v=111&ft=4')
soup = BeautifulSoup(page.text, 'html.parser')
all_dict = {}
filters = soup.find_all('td', attrs={'class': 'filters-cells'})
for i in range(len(filters) // 2):
    i_title = 2 * i
    i_value = 2 * i + 1
    sct = filters[i_title].find_all('span', attrs={'class': 'screener-combo-title'})
    if len(sct) == 1:
        title = ' '.join(sct[0].stripped_strings)
        values = [v.text for v in filters[i_value].find_all('option', attrs={'value': True}) if v.text]
        all_dict[title] = values
max_element = max([len(v) for v in all_dict.values()])
for k in all_dict:
    all_dict[k] = all_dict[k] + [''] * (max_element - len(all_dict[k]))
df = pd.DataFrame.from_dict(all_dict)
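A quick check of the result; the column names 'Exchange' and 'Index' are taken from the example in the question and assume the page still exposes those filters:
# Every column was padded with '' above so they all share the same length.
print(df.shape)
print(df[['Exchange', 'Index']].head())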