I am scraping a URL (example: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-4.html) and the number on the end of the URL is the page number. I am trying to scrape multiple pages, so I used the following code to loop through the multiple pages:
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
When I run the code in my script and print the page, it returns 4, 5 and 6, meaning that it should be working. However whenever I run the full code, it only gives me the results for the 6th page.
What I think may be happening is the code is finalizing on the last number and formatting that into the URL, whenever it should formatting each number into the URL instead.
I have tried looking at other people with similar issues but haven't been able to find a solution. I believe this may be a code formatting error but I am not exactly sure. Any advice is greatly appreciated. Thank you.
Here is the remainder of my code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import os
import pandas as pd
import openpyxl
# define 1-1-2020 as a datetime object
after_date = datetime(2021, 1, 1)
with requests.Session() as s:
s.headers = {"User-Agent": "Safari/537.36"}
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
soup = bs(r.content, 'lxml')
# select all tr elements (minus the first one, which is the header)
table_elements = soup.select('tr')[1:]
address_links = []
for element in table_elements:
children = element.contents # get children of table element
url = children[1].a['href']
last_out_str = children[8].text
if last_out_str != "": # check to make sure the date field isn't empty
last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z") # load date into datetime object for comparison
if last_out > after_date: # if check to see if the date is after last_out
address_links.append(url + '-full') #add adddress_links to the list, -full makes the link show all data
print(address_links)
for url in address_links: #loop through the urls in address_links list
r = s.get(url)
soup = bs(r.content, 'lxml')
ad2 = (soup.title.string) #grab the web title which is used for the filename
ad2 = ad2.replace('Dogecoin', '')
ad2 = ad2.replace('Address', '')
ad2 = ad2.replace('-', '')
filename = ad2.replace(' ', '')
sections = soup.find_all(class_='table-striped')
for section in sections: #This contains the data which is imported into the 'gf' dataframe or the 'info' xlsx sheet
oldprofit = section.find_all('td')[11].text #Get the profit
removetext = oldprofit.replace('USD', '')
removetext = removetext.replace(' ', '')
removetext = removetext.replace(',', '')
profit = float(removetext)
balance = section.find_all('td')[0].text #Get the wallet balance
amount_recieved = section.find_all('td')[3].text #Get amount recieved
ins = amount_recieved[amount_recieved.find('(') + 1:amount_recieved.find(')')] #Filter out text from
# amount recieved
ins = ins.replace('ins', '')
ins = ins.replace(' ', '')
ins = float(ins)
first_recieved = section.find_all('td')[4].text #Get the data of the first incoming transaction
fr = first_recieved.replace('first', '')
fr = fr.replace(':', '')
fr = fr.replace(' ', '')
last_recieved = section.find_all('td')[5].text #Get the date of the last incoming transaction
lr = last_recieved.replace('last', '')
lr = lr.replace(':', '')
lr = lr.replace(' ', '')
amount_sent = section.find_all('td')[7].text #Get the amount sent
outs = amount_sent[amount_sent.find('(') + 1:amount_sent.find(')')] #Filter out the text
outs = outs.replace('outs', '')
outs = outs.replace(' ', '')
outs = float(outs)
first_sent = section.find_all('td')[8].text #Get the first outgoing transaction date
fs = first_sent.replace('first', '') #clean up first outgoing transaction date
fs = fs.replace(':', '')
fs = fs.replace(' ', '')
last_sent = section.find_all('td')[9].text #Get the last outgoing transaction date
ls = last_sent.replace('last', '') #Clean up last outgoing transaction date
ls = ls.replace(':', '')
ls = ls.replace(' ', '')
dbalance = section.find_all('td')[0].select('b') #get the balance of doge
dusd = section.find_all('td')[0].select('span')[1] #get balance of USD
for data in dbalance: #used to clean the text up
balance = data.text
for data1 in dusd: #used to clean the text up
usd = data1.text
# Compare profit to goal, if profit doesn't meet the goal, the URL is not scraped
goal = float(30000)
if profit < goal:
continue
#Select wallets with under 2000 transactions
trans = float(ins + outs) #adds the amount of incoming and outgoing transactions
trans_limit = float(2000)
if trans > trans_limit:
continue
# Create Info Dataframe using the data from above
info = {
'Balance': [balance],
'USD Value': [usd],
'Wallet Profit': [profit],
'Amount Recieved': [amount_recieved],
'First Recieved': [fr],
'Last Recieved': [lr],
'Amount Sent': [amount_sent],
'First Sent': [fs],
'Last Sent': [ls],
}
gf = pd.DataFrame(info)
a = 'a'
if a:
df = \
pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text, attrs={"id": "table_maina"},
index_col=None, header=[0])[0] #uses pandas to read the dataframe and save it
directory = '/Users/chris/Desktop/Files' #directory for the file to go to
file = f'{filename}.xlsx'
writer = pd.ExcelWriter(os.path.join(directory, file), engine='xlsxwriter')
with pd.ExcelWriter(writer) as writer:
df.to_excel(writer, sheet_name='transactions')
gf.to_excel(writer, sheet_name='info')
Check your indentation - In your question the loops are on the same level, so loop that make the requests is iterating over all the pages but results are never processed until iterating is done. That is why it only works for the last page.
Move your loops, that should handle the response and extract elements into your first loop:
...
for page in range(4, 7): #Range designates the page numbers for the URL
r = s.get(f'https://bitinfocharts.com/top-100-richest-dogecoin-addresses-{page}.html') #Format the page number into url
print(page)
soup = bs(r.content, 'lxml')
table_elements = soup.select('tr')[1:]
address_links = []
for element in table_elements:
...
for url in address_links:
...
Related
as of right now i have a working code which is a web scraper that logs into indeed job search site. My issue now is tha I need to create a csv file that shows every single job position that was found, it gives me the numer of positions available and the description of one of them. Hope i can get some help, I would greatly apreciate it.
import re
import csv
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
jk_pattern = re.compile(r"jk:\'([a-zA-Z0-9]+)'")
params = { "q": "mechanical+engineer", "l": "united+states", "start": 0 }
url = "https://www.indeed.com/jobs"
job_keys = set()
for x in range(10):
response = requests.get(url, params=params)
if not response.status_code == 200:
break
else:
keys = jk_pattern.findall(response.text)
if len(keys) > 0:
for key in keys:
job_keys.add(key)
params['start'] += 20
sleep(randint(0, 3))
len(job_keys)
template = "https://www.indeed.com/viewjob?jk={}"
jk = job_keys.pop()
job_url = template.format(jk)
response = requests.get(job_url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.find("div", id="jobDescriptionText").text)
def get_record(card):
"""Extract job data from a single record"""
job_title = card.h2.a.get('title')
company = card.find('span', 'company').text.strip()
job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
post_date = card.find('span', 'date').text
today = datetime.today().strftime('%Y-%m-%d')
summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
job_url = 'https://www.indeed.com' + card.h2.a.get('href')
# this does not exists for all jobs, so handle the exceptions
salary_tag = card.find('span', 'salaryText')
if salary_tag:
salary = salary_tag.text.strip()
else:
salary = ''
record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
return record
def main(position, location):
"""Run the main program routine"""
records = []
url = get_url(position, location)
# extract the job data
while True:
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
cards = soup.find_all('div', 'jobsearch-SerpJobCard')
for card in cards:
record = get_record(card)
records.append(record)
try:
url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
except AttributeError:
break
# save the job data
with open('results.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
writer.writerows(records)
I am working on a screen scraper to pull football statistics down from www.pro-football-reference.com. I'm currently scraping from the main player's stat page and then diving into their individual page with their statistics by year.
I was able to implement this process successfully with my first set of players (quarterbacks, using the Passing Table). However, when I attempted to re-create the process to get running back data, I am reciving an additional column in my data frame with the values "Unnamed: x_level_0". This is my first experience with HTML data so I'm not sure what piece I missed, I just assumed it would be the same code as the quarterbacks.
Below is the QB Code sample and the correct dataframe:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import lxml
import re
import csv
p = 1
url = 'https://www.pro-football-reference.com'
year = 2020
maxp = 300
#Passing Data
r = requests.get(url+ '/years/' + str(year) + '/passing.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]
results = soup.find(id='div_passing')
job_elems = results.find_all('tr')
df = []
LastNameList = []
FirstNameList = []
for i,row in enumerate(parsed_table.find_all('tr')[2:]):
dat = row.find('td', attrs={'data-stat': 'player'})
if dat != None:
name = dat.a.get_text()
print(name)
stub = dat.a.get('href')
#pos = row.find('td', attrs={'data-stat': 'fantasy_pos'}).get_text()
#print(pos)
# grab this players stats
tdf = pd.read_html(url + stub)[1]
for k,v in tdf.iterrows():
#Scrape 2020 stats, if no 2020 stats move on
try:
FindYear=re.search(".*2020.*",v['Year'])
if FindYear:
#If Year for stats is current year append data to dataframe
#get Name data
fullName = row.find('td', {'class':'left'})['csk']
findComma = fullName.find(',',0,len(fullName))
lName = fullName[0:findComma]
fName = fullName[findComma + 1:len(fullName)]
LastNameList.append(lName)
FirstNameList.append(fName)
#get basic stats
df.append(v)
except:
pass
This output looks like the following:
Philip Rivers
Year 2020
Age 39
Tm IND
Pos qb
No. 17
G 1
GS 1
Below is the RB Code sample and the incorrect dataframe:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import lxml
import re
import csv
p = 1
url = 'https://www.pro-football-reference.com'
year = 2020
maxp = 300
#Rushing Data
r = requests.get(url+ '/years/' + str(year) + '/rushing.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]
results = soup.find(id='div_rushing')
job_elems = results.find_all('tr')
df = []
LastNameList = []
FirstNameList = []
for i,row in enumerate(parsed_table.find_all('tr')[2:]):
dat = row.find('td', attrs={'data-stat': 'player'})
if dat != None:
name = dat.a.get_text()
print(name)
stub = dat.a.get('href')
print(stub)
#pos = row.find('td', attrs={'data-stat': 'fantasy_pos'}).get_text()
#print(pos)
# grab this players stats
tdf = pd.read_html(url + stub)[1]
for k,v in tdf.iterrows():
print(v)
#Scrape 2020 stats, if no 2020 stats move on
try:
FindYear=re.search(".*2020.*",v['Year'])
print('found 2020')
if FindYear:
#If Year for stats is current year append data to dataframe
#get Name data
fullName = row.find('td', {'class':'left'})['csk']
findComma = fullName.find(',',0,len(fullName))
lName = fullName[0:findComma]
fName = fullName[findComma + 1:len(fullName)]
LastNameList.append(lName)
FirstNameList.append(fName)
#get basic stats
df.append(v)
except:
pass
This output looks like the following:
Unnamed: 0_level_0 Year 2020
Unnamed: 1_level_0 Age 26
Unnamed: 2_level_0 Tm TEN
Unnamed: 3_level_0 Pos rb
Unnamed: 4_level_0 No. 22
Games G 1
GS 1
Rushing Rush 31
Yds 116
TD 0
An example URL where this data is pulled from is: https://www.pro-football-reference.com/players/J/JacoJo01.htm
And it is pulling the Rushing & Receiving. Is there something additional I need to be on the lookout for when it comes to parsing HTML?
I attempted to add index_col = 1 into my tdf = pd.read_html(url + stub)[1]. However, that just kind of grouped the two values into one column.
Any input on this would be greatly appreciated. If I can provide any further information, please let me know.
Thank you
You can try this code to parse the table passing for each player (Now I get the players from https://www.pro-football-reference.com/years/2020/passing.htm but you can pass any player URL to it:
import requests
from bs4 import BeautifulSoup
def scrape_player(player_name, player_url, year="2020"):
out = []
soup = BeautifulSoup(requests.get(player_url).content, 'html.parser')
row = soup.select_one('table#passing tr:has(th:contains("{}"))'.format(year))
if row:
tds = [player_name] + [t.text for t in row.select('th, td')]
headers = ['Name'] + [th.text for th in row.find_previous('thead').select('th')]
out.append(dict(zip(headers, tds)))
return out
url = 'https://www.pro-football-reference.com/years/2020/passing.htm'
all_data = []
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
for player in soup.select('table#passing [data-stat="player"] a'):
print(player.text)
for data in scrape_player(player.text, 'https://www.pro-football-reference.com' + player['href']):
all_data.append(data)
df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
Creates this csv:
EDIT: To parse Rushing&Receiving, you can use this script:
import requests
from bs4 import BeautifulSoup, Comment
def scrape_player(player_name, player_url, year="2020"):
out = []
soup = BeautifulSoup(requests.get(player_url).content, 'html.parser')
soup = BeautifulSoup(soup.select_one('#rushing_and_receiving_link').find_next(text=lambda t: isinstance(t, Comment)), 'html.parser')
row = soup.select_one('table#rushing_and_receiving tr:has(th:contains("{}"))'.format(year))
if row:
tds = [player_name] + [t.text for t in row.select('th, td')]
headers = ['Name'] + [th.text for th in row.find_previous('thead').select('tr')[-1].select('th')]
out.append(dict(zip(headers, tds)))
return out
url = 'https://www.pro-football-reference.com/years/2020/passing.htm'
all_data = []
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
for player in soup.select('table#passing [data-stat="player"] a'):
print(player.text)
for data in scrape_player(player.text, 'https://www.pro-football-reference.com' + player['href']):
all_data.append(data)
df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
Creates this CSV:
currently i am printing the data.now rather than printing i want to export to
excel./csv new to python pls help.
**data is very huge around 9000 rows with 6 columns?**
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
def scrape_bid_data():
page_no = 1 #initial page number
while True:
print('Hold on creating URL to fetch data...')
URL = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no) #create dynamic URL
print('URL cerated: ' + URL)
scraped_data = requests.get(URL,verify=False) # request to get the data
soup_data = bs(scraped_data.text, 'lxml') #parse the scraped data using lxml
extracted_data = soup_data.find('div',{'id':'pagi_content'}) #find divs which contains required data
if len(extracted_data) == 0: # **if block** which will check the length of extracted_data if it is 0 then quit and stop the further execution of script.
break
else:
for idx in range(len(extracted_data)): # loops through all the divs and extract and print data
if(idx % 2 == 1): #get data from odd indexes only because we have required data on odd indexes
bid_data = extracted_data.contents[idx].text.strip().split('\n')
print('-' * 100)
print(bid_data[0]) #BID number
print(bid_data[5]) #Items
print(bid_data[6]) #Quantitiy Required
print(bid_data[10] + bid_data[12].strip()) #Department name and address
print(bid_data[16]) #Start date
print(bid_data[17]) #End date
print('-' * 100)
page_no +=1 #increments the page number by 1
scrape_bid_data()
I think you should start by returning the extract_data object containing your data at the end of your function.
page_no = 1
def scrap_bid_data(page):
print('Hold on creating URL to fetch data...')
URL = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page)
print('URL cerated: ' + URL)
scraped_data = requests.get(URL,verify=False) # request to get the data
soup_data = bs(scraped_data.text, 'lxml') #parse the scraped data using lxml
extracted_data = soup_data.find('div',{'id':'pagi_content'})
return extracted_data
Then use it to create a dataframe
extract_data = scrap_bid_data(page_no)
import pandas as pd
df = pd.DataFrame(extract_data)
and then export this fataframe.
df.to_csv ('file_name_{}'.format(page_no))
I am working on a project, and I need to remove the left and right most character of a data result. The data forms a scrape of craigslist, and the neighborhood results return as '(####)', but what I need it to be is ####. I am using pandas, and trying to use lstrip & rstrip. When I attempt it inside the python shell, it works, but when I use it on my data it does not work.
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.rstrip(')')
For some reason, the rstrip, does work and removes the ')' but the lstrip does not.
The full code is:
from bs4 import BeautifulSoup
import json
from requests import get
import numpy as np
import pandas as pd
import csv
print('hello world')
#get the initial page for the listings, to get the total count
response = get('https://washingtondc.craigslist.org/search/hhh?query=rent&availabilityMode=0&sale_date=all+dates')
html_result = BeautifulSoup(response.text, 'html.parser')
results = html_result.find('div', class_='search-legend')
total = int(results.find('span',class_='totalcount').text)
pages = np.arange(0,total+1,120)
neighborhood = []
bedroom_count =[]
sqft = []
price = []
link = []
for page in pages:
#print(page)
response = get('https://washingtondc.craigslist.org/search/hhh?s='+str(page)+'query=rent&availabilityMode=0&sale_date=all+dates')
html_result = BeautifulSoup(response.text, 'html.parser')
posts = html_result.find_all('li', class_='result-row')
for post in posts:
if post.find('span',class_='result-hood') is not None:
post_url = post.find('a',class_='result-title hdrlnk')
post_link = post_url['href']
link.append(post_link)
post_neighborhood = post.find('span',class_='result-hood').text
post_price = int(post.find('span',class_='result-price').text.strip().replace('$',''))
neighborhood.append(post_neighborhood)
price.append(post_price)
if post.find('span',class_='housing') is not None:
if 'ft2' in post.find('span',class_='housing').text.split()[0]:
post_bedroom = np.nan
post_footage = post.find('span',class_='housing').text.split()[0][:-3]
bedroom_count.append(post_bedroom)
sqft.append(post_footage)
elif len(post.find('span',class_='housing').text.split())>2:
post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
post_footage = post.find('span',class_='housing').text.split()[2][:-3]
bedroom_count.append(post_bedroom)
sqft.append(post_footage)
elif len(post.find('span',class_='housing').text.split())==2:
post_bedroom = post.find('span',class_='housing').text.replace("br","").split()[0]
post_footage = np.nan
bedroom_count.append(post_bedroom)
sqft.append(post_footage)
else:
post_bedroom = np.nan
post_footage = np.nan
bedroom_count.append(post_bedroom)
sqft.append(post_footage)
#create results data frame
post_results = pd.DataFrame({'neighborhood':neighborhood,'footage':sqft,'bedroom':bedroom_count,'price':price,'link':link})
#clean up results
post_results.drop_duplicates(subset='link')
post_results['footage'] = post_results['footage'].replace(0,np.nan)
post_results['bedroom'] = post_results['bedroom'].replace(0,np.nan)
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.rstrip(')')
post_results = post_results.dropna(subset=['footage','bedroom'],how='all')
post_results.to_csv("rent_clean.csv",index=False)
print(len(post_results.index))
This problem will happened when you have whitespace in the front
For example :
s=pd.Series([' (xxxx)','(yyyy) '])
s.str.strip('(|)')
0 (xxxx
1 yyyy)
dtype: object
What we can do is strip twice
s.str.strip().str.strip('(|)')
0 xxxx
1 yyyy
dtype: object
From my understanding of your question, you are removing characters from a string. You don't need pandas for this. Strings have a length and you can remove the first and last character like this;
new_word = old_word[1:-1]
This should work for you. Good luck.
I am importing links to boxscores from this webpage
http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html
This is how I am doing it now. I get the links from the first page.
url = 'http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/wnba/teams/pastresults/2012/team665231.html'
boxurl = urllib2.urlopen(url).read()
soup = BeautifulSoup(boxurl)
boxscores = soup.findAll('a', href=re.compile('boxscore'))
basepath = "http://www.covers.com"
pages=[] # This grabs the links from the page
for a in boxscores:
pages.append(urllib2.urlopen(basepath + a['href']).read())
Then in a new window I do this.
newsoup = pages[1] # I am manually changing this every time
soup = BeautifulSoup(newsoup)
def _unpack(row, kind='td'):
return [val.text for val in row.findAll(kind)]
tables = soup('table')
linescore = tables[1]
linescore_rows = linescore.findAll('tr')
roadteamQ1 = float(_unpack(linescore_rows[1])[1])
roadteamQ2 = float(_unpack(linescore_rows[1])[2])
roadteamQ3 = float(_unpack(linescore_rows[1])[3])
roadteamQ4 = float(_unpack(linescore_rows[1])[4]) # add OT rows if ???
roadteamFinal = float(_unpack(linescore_rows[1])[-3])
hometeamQ1 = float(_unpack(linescore_rows[2])[1])
hometeamQ2 = float(_unpack(linescore_rows[2])[2])
hometeamQ3 = float(_unpack(linescore_rows[2])[3])
hometeamQ4 = float(_unpack(linescore_rows[2])[4]) # add OT rows if ???
hometeamFinal = float(_unpack(linescore_rows[2])[-3])
misc_stats = tables[5]
misc_stats_rows = misc_stats.findAll('tr')
roadteam = str(_unpack(misc_stats_rows[0])[0]).strip()
hometeam = str(_unpack(misc_stats_rows[0])[1]).strip()
datefinder = tables[6]
datefinder_rows = datefinder.findAll('tr')
date = str(_unpack(datefinder_rows[0])[0]).strip()
year = 2012
from dateutil.parser import parse
parsedDate = parse(date)
date = parsedDate.replace(year)
month = parsedDate.month
day = parsedDate.day
modDate = str(day)+str(month)+str(year)
gameid = modDate + roadteam + hometeam
data = {'roadteam': [roadteam],
'hometeam': [hometeam],
'roadQ1': [roadteamQ1],
'roadQ2': [roadteamQ2],
'roadQ3': [roadteamQ3],
'roadQ4': [roadteamQ4],
'homeQ1': [hometeamQ1],
'homeQ2': [hometeamQ2],
'homeQ3': [hometeamQ3],
'homeQ4': [hometeamQ4]}
globals()["%s" % gameid] = pd.DataFrame(data)
df = pd.DataFrame.load('df')
df = pd.concat([df, globals()["%s" % gameid]])
df.save('df')
How can I automate this so I don't have to manually change newsoup = pages[1] manually and scrape all of the boxscores that are linked from the first url in one go. I am pretty new to python and lacking in some understanding of the basics.
So in the first code box you collect the pages
So in the second code box you have to loop this, if I understood it
for page in pages:
soup = BeautifulSoup(page)
# rest of the code here