I am trying to scrape data using a loop, and this is the code:
import requests
import json
import pandas as pd

parameters = ['a:1','a:2','a:3','a:4','a:3','a:4','a:5','a:6','a:7','a:8','a:9','a:10']

# Collect one frame per request and concatenate ONCE at the end:
# DataFrame.append was removed in pandas 2.0, and calling it inside the
# loop re-copied every previously accumulated row on each iteration.
frames = []
for item in parameters:
    key, value = item.split(':')
    url = "https://xxxx.000webhostapp.com/getNamesEnc02Motasel2.php?keyword=%s&type=2&limit=%s" %(key, value)
    r = requests.get(url)
    cont = json.loads(r.content)
    frames.append(pd.DataFrame(cont))
results = pd.concat(frames)
results.to_csv('ScrapeData.csv', index=False)
This method is working great, but the problem is that I need the parameters to go up to 'a:1000', and I think there is a better solution to loop from 'a:1' to 'a:1000' instead of duplicating the parameters as in my code.
I really need your help.
You can use a `for i in range(start, end)` loop, like this:
key = 'a'
# Collect the per-request frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
frames = []
# Goes from 1 to 1000 (including both)
for value in range(1, 1001):
    url = f'https://xxxx.000webhostapp.com/getNamesEnc02Motasel2.php?keyword={key}&type=2&limit={value}'
    r = requests.get(url)
    cont = json.loads(r.content)
    frames.append(pd.DataFrame(cont))
results = pd.concat(frames)
results.to_csv('ScrapeData.csv', index=False)
# Complete while-loop version of the same idea (the original sketch used
# '....' placeholders; filled in with the URL and parsing from the question).
value = 1
key = 'a'
frames = []
while value <= 1000:
    url = "https://xxxx.000webhostapp.com/getNamesEnc02Motasel2.php?keyword=%s&type=2&limit=%s" % (key, str(value))
    r = requests.get(url)
    frames.append(pd.DataFrame(json.loads(r.content)))
    value += 1
results = pd.concat(frames)
results.to_csv('ScrapeData.csv', index=False)
Use a counter
Related
I have tried many ways to concatenate a list of DataFrames together but am continuously getting the error message "ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part."
At the moment the list only contains two elements, both of them being DataFrames. They do have different columns in places, but I didn't think this would be an issue. At the moment I have:
df_year_stats = pd.concat(yearStats, axis = 0, ignore_index = True).reset_index(drop=True)
I don't think the dataframes have any lists in them, but that is the only plausible cause I have thought of so far; if so, how would I go about checking for these?
Any help would be greatly appreciated, thank you.
edit code:
import pandas as pd
from pandas.api.types import is_string_dtype
import requests
from bs4 import BeautifulSoup as bs

course_df = pd.read_csv("dg_course_table.csv")

soup = bs(requests.get('https://www.pgatour.com/stats/categories.ROTT_INQ.html').text, 'html.parser')
# attrs must be a dict mapping attribute -> value (a set literal was passed before)
tabs = soup.find('div', attrs={'class': 'tabbable-head clearfix hidden-small'})
subStats = tabs.find_all('a')

# creating lists of tab and link, and removing the first and last
tab_links = []
tab_names = []
for subStat in subStats:
    tab_names.append(subStat.text)
    tab_links.append(subStat.get('href'))
tab_names = tab_names[1:-2]  # potentially remove other areas here - points/rankings and streaks
tab_links = tab_links[1:-2]

# looping through each tab and extracting all of the stats URLs,
# along with the corresponding stat name.
stat_links = []
all_stat_names = []
for link in tab_links:
    page2 = 'https://www.pgatour.com' + str(link)
    req2 = requests.get(page2)
    soup2 = bs(req2.text, 'html.parser')
    # find correct part of html code
    section = soup2.find('section', attrs={'class': 'module-statistics-off-the-tee clearfix'})
    for anchor in section.find_all('a'):
        stat_links.append(anchor.get('href'))
        all_stat_names.append(anchor.text)

s_asl = pd.Series(stat_links, index=all_stat_names)
s_asl = s_asl.drop(labels='show more')
s_asl = s_asl.str[:-4]  # drop the '.html' suffix so year/tournament can be spliced in

df_all_stats = []
req4 = requests.get('https://www.pgatour.com/content/pgatour/stats/stat.120.y2014.html')
soup4 = bs(req4.text, 'html.parser')
dropdown = soup4.find('select', attrs={'aria-label': 'Available Tournaments'})
# finding all tournament codes for the given year.
# Series.append was removed in pandas 2.0 -- collect values/labels and build once.
codes = []
names = []
for h in dropdown.find_all('option'):
    codes.append(h.get('value'))
    names.append(h.text)
tourn_links = pd.Series(codes, index=names, dtype='str')

yearStats = []
count = 0
for tournament in tourn_links[0:2]:  # create stat tables for two different golf tournaments
    print(tournament)
    df1 = []
    df_labels = []
    # loop through all stat links, adding the corresponding stat to that tournament's df
    for r in range(0, len(s_asl)):
        try:
            link = 'https://www.pgatour.com' + s_asl[r] + 'y2014.eon.' + tournament + '.html'
            web = pd.read_html(requests.get(link).text)
            table = web[1].set_index('PLAYER NAME')
            df1.append(table)
            df_labels.append(s_asl.index[r])
        except Exception:  # narrowed from bare except; still best-effort per stat
            print("empty table")
    try:
        df_tourn_stats = pd.concat(df1, keys=df_labels, axis=1)
        df_tourn_stats.reset_index(level=0, inplace=True)
        df_tourn_stats.insert(1, 'Tournament Name', tourn_links.index[count])
        df_tourn_stats.to_csv(str(count) + ".csv")
        # drop duplicated column labels BEFORE concatenating across tournaments;
        # duplicated labels are one cause of "inhomogeneous shape" errors in pd.concat
        df_tourn_stats = df_tourn_stats.loc[:, ~df_tourn_stats.columns.duplicated()].copy()
        yearStats.append(df_tourn_stats)
    except Exception:  # narrowed from bare except
        print("NO DATA")
    count = count + 1

# combine the stats of the two different tournaments into one dataframe
df_year_stats = pd.concat(yearStats, axis=0, ignore_index=True).reset_index(drop=True)
I wrote a script to get historical data from the public trades endpoint of the Kraken API; the code is as follows:
import pandas as pd
import json
import time
import urllib.request


def get_data(pair, since, until):
    """Download all public trades for *pair* between the unix timestamps
    *since* and *until*, paging through the Kraken Trades endpoint.

    Returns the trades as a DataFrame; also stores it in the module-level
    ``data`` for backward compatibility with code that read the global.
    """
    global data  # kept so existing callers that read the global still work
    data_columns = ["price", "volume", "time", "buy/sell", "market/limit", "miscellaneous"]
    # Collect one frame per page and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    frames = []
    api_start = since
    app_start_time = time.time()
    counter = 1
    while api_start < until:
        last_time = time.time()
        api_domain = "https://api.kraken.com/0/public/Trades" + \
            "?pair=%(pair)s&since=%(since)s" % {"pair": pair, "since": api_start}
        api_request = urllib.request.Request(api_domain)
        try:
            raw = urllib.request.urlopen(api_request).read()
        except Exception:
            # Transient network failure: wait and retry this page.
            # (Previously the code slept and then fell through to parse a
            # stale -- or on the first iteration, undefined -- response.)
            time.sleep(3)
            continue
        api_data = json.loads(raw)
        if len(api_data["error"]) != 0:
            print(api_data["error"])
            time.sleep(3)
            continue
        frames.append(pd.DataFrame(api_data["result"][pair], columns=data_columns))
        # "last" is a nanosecond-resolution cursor string; first 10 digits = seconds
        api_start = int(api_data["result"]["last"][:10])
        counter +=1
        time.sleep(1)
        print("Request number: %s" %counter)
        print("Time since start: %s minutes" % round((time.time() - app_start_time)/60,2))
        print("Time since last request: %s seconds" % round((time.time() - last_time),2))
        print("last: %s" %api_start)
        print("")
    data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=data_columns)
    return data


get_data("XXBTZUSD", 1414761200, 1455761200)
After some successful responses, I get flawed responses, looking like this:
As you can see, at some point, the UNIX time stamp simply jumps from 142894080.33775 to 1654992002.801943 and thus resulting in wrong data.
Is that a problem with my code or with the API?
Thanks in advance.
Taking the liberty to simplify your code I cannot confirm your observation. I get proper timestamps.
Try this:
import requests
import pandas as pd  # was used below without an import


def get_data(pair, since):
    """Fetch one page of public trades for *pair* from the Kraken API."""
    url = f"https://api.kraken.com/0/public/Trades?pair={pair}&since={since}"
    # Response.json() parses the body directly -- no separate json.loads
    # (the json module was never imported in this snippet)
    return requests.get(url).json()


results = get_data("XBTUSD", 1414761200)
columns= ["price", "volume", "time", "buy/sell", "market/limit", "miscellaneous"]
df = pd.DataFrame(results["result"]["XXBTZUSD"], columns=columns)
df.time = df.time.astype(int)
df.head()
Print out:
price volume time buy/sell market/limit miscellaneous
0 340.09209 0.02722956 1414815678 s m
1 340.15346 0.21604000 1414820304 s m
2 340.00000 0.03395999 1414820304 s m
3 340.00001 0.01000000 1414821818 s l
4 340.00000 0.25668009 1414821818 s l
Edit:
Using pagination I can confirm the jump in timestamps. The problem very likely lies with the API.
import time
from datetime import datetime  # was used below without an import

import pandas as pd
import requests


def get_data(pair, since):
    """Fetch one page of public trades for *pair* starting at *since*."""
    url = f"https://api.kraken.com/0/public/Trades?pair={pair}&since={since}"
    return requests.get(url).json()


start_ts = 1414761200
frames = []
for _ in range(30):
    print(start_ts)
    print(datetime.fromtimestamp(int(start_ts)))
    tmp = get_data("XBTUSD", start_ts)
    start_ts = tmp["result"]["last"][:10]
    # BUG FIX: append the page just fetched (tmp), not the stale `results`
    # from an earlier run -- the old line stored the same first page 30 times
    frames.append(pd.DataFrame(tmp["result"]["XXBTZUSD"]))
    time.sleep(2)
Print out after a couple of iterations:
1438313128
2015-07-31 05:25:28
1653648031
2022-05-27 12:40:31
I am working on a project, and I need to remove the left and right most character of a data result. The data forms a scrape of craigslist, and the neighborhood results return as '(####)', but what I need it to be is ####. I am using pandas, and trying to use lstrip & rstrip. When I attempt it inside the python shell, it works, but when I use it on my data it does not work.
# Peel the wrapping parentheses off: '(' from the left, ')' from the right.
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(').str.rstrip(')')
For some reason the rstrip does work and removes the ')', but the lstrip does not.
The full code is:
from bs4 import BeautifulSoup
import json
from requests import get
import numpy as np
import pandas as pd
import csv

print('hello world')

# get the initial page for the listings, to get the total count
response = get('https://washingtondc.craigslist.org/search/hhh?query=rent&availabilityMode=0&sale_date=all+dates')
html_result = BeautifulSoup(response.text, 'html.parser')
results = html_result.find('div', class_='search-legend')
total = int(results.find('span', class_='totalcount').text)
pages = np.arange(0, total + 1, 120)

neighborhood = []
bedroom_count = []
sqft = []
price = []
link = []

for page in pages:
    # BUG FIX: '&' was missing between the 's' and 'query' parameters
    response = get('https://washingtondc.craigslist.org/search/hhh?s=' + str(page) + '&query=rent&availabilityMode=0&sale_date=all+dates')
    html_result = BeautifulSoup(response.text, 'html.parser')
    posts = html_result.find_all('li', class_='result-row')
    for post in posts:
        if post.find('span', class_='result-hood') is not None:
            post_url = post.find('a', class_='result-title hdrlnk')
            link.append(post_url['href'])
            neighborhood.append(post.find('span', class_='result-hood').text)
            price.append(int(post.find('span', class_='result-price').text.strip().replace('$', '')))
            housing = post.find('span', class_='housing')
            if housing is not None:
                parts = housing.text.split()
                if 'ft2' in parts[0]:
                    bedroom_count.append(np.nan)
                    sqft.append(parts[0][:-3])
                elif len(parts) > 2:
                    bedroom_count.append(housing.text.replace("br", "").split()[0])
                    sqft.append(parts[2][:-3])
                elif len(parts) == 2:
                    bedroom_count.append(housing.text.replace("br", "").split()[0])
                    sqft.append(np.nan)
                else:
                    bedroom_count.append(np.nan)
                    sqft.append(np.nan)
            else:
                # no housing span at all: still append so all five lists stay aligned
                bedroom_count.append(np.nan)
                sqft.append(np.nan)

# create results data frame
post_results = pd.DataFrame({'neighborhood': neighborhood, 'footage': sqft, 'bedroom': bedroom_count, 'price': price, 'link': link})

# clean up results
# BUG FIX: drop_duplicates returns a new frame; the result was being discarded
post_results = post_results.drop_duplicates(subset='link')
post_results['footage'] = post_results['footage'].replace(0, np.nan)
post_results['bedroom'] = post_results['bedroom'].replace(0, np.nan)
# Strip surrounding whitespace FIRST -- leading spaces were blocking lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.strip()
post_results['neighborhood'] = post_results['neighborhood'].str.lstrip('(')
post_results['neighborhood'] = post_results['neighborhood'].str.rstrip(')')
post_results = post_results.dropna(subset=['footage', 'bedroom'], how='all')
post_results.to_csv("rent_clean.csv", index=False)
print(len(post_results.index))
This problem happens when there is whitespace at the front.
For example :
# Series.str.strip treats its argument as a SET of characters, not a regex:
# '(|)' means "strip any of '(', '|', ')'".  Leading/trailing whitespace is
# not in that set, so it blocks the strip -- which is why parentheses survive.
s=pd.Series([' (xxxx)','(yyyy) '])
s.str.strip('(|)')
0 (xxxx
1 yyyy)
dtype: object
What we can do is strip twice
# Strip the whitespace first, then the parentheses.
s.str.strip().str.strip('(|)')
0 xxxx
1 yyyy
dtype: object
From my understanding of your question, you are removing characters from a string. You don't need pandas for this. Strings can be sliced, and you can remove the first and last character like this:
# Slice off the first and last character.
# NOTE(review): this assumes the wrapping characters are always present;
# an unwrapped value would lose two legitimate characters.
new_word = old_word[1:-1]
This should work for you. Good luck.
I have a script that extracts data from an API, where the final output of requests.get(url=url, auth=(user, password)).json() is "all_results". The output is ~25K rows, but it contains nested fields.
The API is for portfolio data, and the children field is a dictionary holding ticker level information (so can be really large).
The script below flattens "all_results" and specifies only the columns I need:
# Collect one frame per record and concatenate ONCE at the end.
# `final_df = final_df.append(df)` re-copied every previously accumulated
# row on each iteration (quadratic time) -- that is why large runs took
# hours; DataFrame.append was also removed in pandas 2.0.
frames = []
for record in all_results:
    df = pd.DataFrame(record.get('children', {}))
    df['contactId'] = record.get('contactId')
    df['origin'] = record.get('origin')
    df['description'] = record.get('description')
    frames.append(df)
final_df = pd.concat(frames) if frames else pd.DataFrame()
It works perfectly with smaller samples, however when trying to run it over the whole data set- it takes HOURS. Can anyone propose something more efficient than my current script? Need it to run way faster than currently.
Thank you in advance!
-- Full script--
user = ''
password= ""

# Starting values
start = 0
rows = 1500
base_url = 'https://....?start={0}&rows={1}'

print ("Connecting to API..")
url = base_url.format(start,rows)
req = requests.get(url=url, auth=(user, password))
print ("Extracting data..")
out = req.json()
total_records = out['other']['numFound']
print("Total records found: "+ str(total_records))
results = out['resultList']
all_results = results
print ("First " + str(rows) + " rows were extracted")

# Results will be an empty list if no more results are found
while results:
    start += rows # Rebuild url based on current start
    url = base_url.format(start, rows)
    req = requests.get(url=url, auth=(user, password))
    out = req.json()
    results = out['resultList']
    all_results += results
    print ("Next " + str(rows) + " rows were extracted")

# All results will now contain all the responses of each request.
print("Total records returned from API: "+ str(len(all_results))) #should equal number of records in response

# Flatten: build one frame per record and concatenate ONCE -- appending to a
# DataFrame inside the loop re-copies all earlier rows every iteration
# (quadratic), and DataFrame.append was removed in pandas 2.0.
frames = []
for record in all_results:
    df = pd.DataFrame(record.get('children', {}))
    df['contactId'] = record.get('contactId')
    df['origin'] = record.get('origin')
    df['description'] = record.get('description')
    frames.append(df)
# ignore_index=True replaces the old reset_index()/del-index dance
final_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
final_df['ticker'] = final_df['identifier'].str.split('#').str.get(0) #extract ticker (anything before #)
# BUG FIX: drop_duplicates returns a new frame; the result was being discarded
final_df = final_df.drop_duplicates(keep='first')
print('DataFrame from API created succesfully\n')
print(final_df.head(n=50))
I'm trying to get data from the table at the bottom of http://projects.fivethirtyeight.com/election-2016/delegate-targets/.
import requests
from lxml import html

url = "http://projects.fivethirtyeight.com/election-2016/delegate-targets/"
response = requests.get(url)
doc = html.fromstring(response.text)
# BUG FIX: XPath attribute predicates use '@', not '#'.
# './/table[#class=...]' is not a valid attribute test and matches nothing.
tables = doc.findall('.//table[@class="delegates desktop"]')
election = tables[0]
election_rows = election.findall('.//tr')
def extractCells(row, isHeader=False):
    """Return the text of every cell in *row*.

    Header rows are read from ``th`` elements, data rows from ``td``.
    """
    tag = './/th' if isHeader else './/td'
    return [cell.text_content() for cell in row.findall(tag)]
import pandas
def parse_options_data(table):
    """Build a DataFrame from *table*: row 1 supplies the column names,
    rows 2 onward supply the data (row 0, the candidate names, is skipped)."""
    all_rows = table.findall(".//tr")
    column_names = extractCells(all_rows[1], isHeader=True)
    body = [extractCells(tr, isHeader=False) for tr in all_rows[2:]]
    return pandas.DataFrame(body, columns=column_names)
# Parse the first delegate table into a DataFrame.
election_data = parse_options_data(election)
# Bare expression displays the frame (notebook-style).
election_data
I'm having trouble with the topmost row with the candidates' names ('Trump', 'Cruz', 'Kasich'). It is under tr class="top" and right now I only have tr class="bottom" (starting with the row that says "won/target").
Any help is much appreciated!
The candidate names are in the 0-th row:
# Candidate names live in the header row (row 0); [1:] skips the leading empty th.
candidates = [val.text_content() for val in rows[0].findall('.//th')[1:]]
Or, if reusing the same extractCells() function:
# Same extraction via the helper; [1:] drops the first (empty) header cell.
candidates = extractCells(rows[0], isHeader=True)[1:]
[1:] slice here is to skip the first empty th cell.
Not ideal (it is hard-coded), but it runs the way you want:
def parse_options_data(table):
    """Build a DataFrame whose header joins the first three cells of row 1
    with the candidate names taken from row 0 (skipping its empty first th)."""
    all_rows = table.findall(".//tr")
    candidate = extractCells(all_rows[0], isHeader=True)[1:]
    column_names = extractCells(all_rows[1], isHeader=True)[:3] + candidate
    body = [extractCells(tr, isHeader=False) for tr in all_rows[2:]]
    return pandas.DataFrame(body, columns=column_names)