I'm trying to get a list of clinicaltrials.gov URLs along with a brief description of each project. My program obtains the NCT numbers, which are appended to a base URL to build the full clinical trial profile page. For example, adding "NCT00471926" to the end of "https://clinicaltrials.gov/ct2/show/" takes you to a full profile from which the brief description is obtained. This all works fine, except when I have more than 100 results. In the URL I provide below there are 216 results, and my script can only scrape the first 100.
Alternatively, I could download all 216 results into a CSV file and import each NCT number, but that's not practical to do each week. Is there a way I can either (1) get all 216 results through web scraping or (2) write code to download my results? For #2, it looks like JavaScript is used, so I can't do this (to my knowledge) with Beautiful Soup. Thanks in advance.
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

out = []
allncturls = []

# get the NCT numbers first and build the profile URLs
ncturls = ['https://clinicaltrials.gov/ct2/results?cond=&term=diabetes+quality+improvement&cntry=&state=&city=&dist=']
for ncturl in ncturls:
    response = requests.get(ncturl)
    soup = BeautifulSoup(response.content, 'html.parser')
    for v in soup.find_all("a", href=re.compile('ct2/show/NCT')):
        ext_url = str(v).split()
        # pick out the token containing the NCT number
        z = "\n".join(s for s in ext_url if 'NCT' in s)
        z = z.split(':')[0]
        nct_number = z.split()[1]
        nct_url = 'https://clinicaltrials.gov/ct2/show/' + nct_number
        allncturls.append(nct_url)

# visit each profile page and pull the brief summary
urls = allncturls
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    abstract = re.sub('\n+|\xa0', '', soup.select_one('.ct-body3:contains("Brief Summary:") + div').text.strip())
    out.append({'url': url, 'abstract': abstract})

df = pd.DataFrame(out)
df.to_excel('clinicaltrialstresults.xlsx')
The following gets all the links and shows a structure for making requests to each one. The basic info for all 216 items comes back as JSON from the first three paged requests and is stored in json_items for later access. In the loop over all the URLs to get the detailed summary, you need to decide how you want to pull the information together; I have only run it for one request. It requires bs4 4.7.1+ for the :has() selector, though I can easily alter the CSS selector if that is required.
import requests, math
from bs4 import BeautifulSoup as bs
import time
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Referer': 'https://clinicaltrials.gov/ct2/results?cond=&term=diabetes+quality+improvement&cntry=&state=&city=&dist=',
    'X-Requested-With': 'XMLHttpRequest'
}
data = {'draw': '3'}
# the payload repeats the same six keys for columns 0-25, so build it in a loop
for i in range(26):
    data.update({
        f'columns[{i}][data]': str(i),
        f'columns[{i}][name]': '',
        f'columns[{i}][searchable]': 'false' if i == 1 else 'true',
        f'columns[{i}][orderable]': 'false',
        f'columns[{i}][search][value]': '',
        f'columns[{i}][search][regex]': 'false',
    })
data.update({
    'start': '0',
    'length': '100',
    'search[value]': '',
    'search[regex]': 'false'
})
json_items = {}
urls = []

with requests.Session() as s:
    r = s.post('https://clinicaltrials.gov/ct2/results/rpc/5i0yqihHSdCL5Q7Gp61PzwS3ai7GvQ1PxnhzmwoyZiNHm67xW', headers=headers, data=data).json()
    json_items[1] = r
    num_results = int(r['recordsFiltered'])
    urls += [f'https://clinicaltrials.gov/ct2/show/{i[1]}' for i in r['data']]
    num_pages = math.ceil(num_results / 100)

    for page in range(2, num_pages + 1):
        data['start'] = str(int(data['start']) + 100)
        r = s.post('https://clinicaltrials.gov/ct2/results/rpc/5i0yqihHSdCL5Q7Gp61PzwS3ai7GvQ1PxnhzmwoyZiNHm67xW', headers=headers, data=data).json()
        json_items[page] = r  # store all json in case wanted
        urls += [f'https://clinicaltrials.gov/ct2/show/{i[1]}' for i in r['data']]

    for count, url in enumerate(urls):
        if count % 10 == 0:  # some form of pause every x requests
            time.sleep(2)
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        study = soup.select_one('h1').text
        detailed_desc = soup.select_one('.ct-body3:has(#detaileddesc) + .tr-indent2').text
        # do something with detailed desc etc
        break  # delete me
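One possible way to pull everything together (a sketch, not part of the original answer): replace the detail loop above, still inside the with-block, with something that collects a dict per study and then reuses the pandas export from the question. The selectors are the ones already used above; the empty-string fallback for a missing detailed description is an assumption.
import pandas as pd

results = []
for count, url in enumerate(urls):
    if count % 10 == 0:
        time.sleep(2)
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    desc_el = soup.select_one('.ct-body3:has(#detaileddesc) + .tr-indent2')
    results.append({
        'url': url,
        'study': soup.select_one('h1').text,
        'detailed_desc': desc_el.text if desc_el else ''  # assumed fallback when the section is absent
    })

df = pd.DataFrame(results)
df.to_excel('clinicaltrialsresults.xlsx')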
I am trying to load an internal data structure into pandas with something like the following:
df = pd.DataFrame(self.data,
nrows=num_rows+500,
skiprows=skip_rows,
header=header_row,
usecols=limit_cols)
However, it doesn't appear to apply any of those arguments (other than the data itself) the way it does when reading a CSV file. Is there another method I can use to get more control over the data I'm ingesting, or do I need to rebuild the data before loading it into pandas?
My input data looks like this:
data = [
['ABC', 'es-419', 'US', 'Movie', 'Full Extract', 'PARIAH', '', '', 'EST', 'Features - EST', 'HD', '2017-05-12 00:00:00', 'Open', 'WSP', '10.5000', '', '', '', '', '10.5240/8847-7152-6775-8B59-ADE0-Y', '10.5240/FFE3-D036-A9A4-9E7A-D833-1', '', '', '', '04065', '', '', '2011', '', '', '', '', '', '', '', '', '', '', '', '113811', '', '', '', '', '', '04065', '', 'Spanish (LAS)', 'US', '10', 'USA NATL SALE', '2017-05-11 00:00:00', 'TIER 3', '21', '', '', 'USA NATL SALE-SPANISH LANGUAGE', 'SPAN'],
['ABC', 'es-419', 'US', 'Movie', 'Full Extract', 'PATCH ADAMS', '', '', 'EST', 'Features - EST', 'HD', '2017-05-12 00:00:00', 'Open', 'WSP', '10.5000', '', '', '', '', '10.5240/DD84-FBF4-8F67-D6F3-47FF-1', '10.5240/B091-00D4-8215-39D8-0F33-8', '', '', '', 'U2254', '', '', '1998', '', '', '', '', '', '', '', '', '', '', '', '113811', '', '', '', '', '', 'U2254', '', 'Spanish (LAS)', 'US', '10', 'USA NATL SALE', '2017-05-11 00:00:00', 'TIER 3', '21', '', '', 'USA NATL SALE-SPANISH LANGUAGE', 'SPAN']
]
So I'm looking to be able to state which rows it should load (or skip) and which columns it should keep (usecols). Is that possible with an internal Python data structure?
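For what it's worth, a minimal sketch of one way to get that effect: the pd.DataFrame() constructor itself only takes data, index, columns, dtype and copy, so the read_csv-style arguments have to be emulated with plain slicing and .iloc. The variable names (skip_rows, num_rows, limit_cols) are the asker's; their exact meanings here are assumptions.
import pandas as pd

skip_rows, num_rows = 0, 1          # assumed meanings: rows to drop, rows to keep
limit_cols = [0, 1, 2, 5, 27]       # assumed: positions of the columns to keep

rows = data[skip_rows:skip_rows + num_rows]   # row selection with plain list slicing
df = pd.DataFrame(rows)
df = df.iloc[:, limit_cols]                   # column selection, like usecols
print(df)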
Hey guys, I'm trying to get data from a stock market API.
import zmq
context = zmq.Context()
socket = context.socket(zmq.SUB)
print("Recibo mensajes del servidor...")
socket.connect("tcp://XXXXXXXXXXXXXXXXXX")
socket.setsockopt_string(zmq.SUBSCRIBE, u'')
while True:
    JSON = socket.recv_json()
    print(JSON)
And I receive data like this:
{'X': {'MDReqId': 'HUB22_1533207696768',
       'MDIncGrp': [{'offer': {'OrderID': '', 'SettlType': '3', 'MDEntrySeller': '',
                               'Precio': '435', 'MDEntryPositionNo': '1', 'SettlDate': '',
                               'MDEntrySize': 2000.0, 'MDUpdateAction': '2',
                               'MDEntryTime': '10:37:56', 'Symbol': 'BA.C',
                               'MDEntryBuyer': '', 'NumberOfOrders': '', 'MDEntryDate': '20180802'}}],
       'MDBookType': 2}}
{'X': {'MDReqId': 'HUB22_1533207696768',
       'MDIncGrp': [{'bid': {'OrderID': '', 'SettlType': '3', 'MDEntrySeller': '',
                             'Precio': '410', 'MDEntryPositionNo': '2', 'SettlDate': '',
                             'MDEntrySize': 24.0, 'MDUpdateAction': '0',
                             'MDEntryTime': '10:37:56', 'Symbol': 'BA.C',
                             'MDEntryBuyer': '200', 'NumberOfOrders': '', 'MDEntryDate': '20180802'}},
                    {'offer': {'OrderID': '', 'SettlType': '3', 'MDEntrySeller': '046',
                               'Precio': '450', 'MDEntryPositionNo': '1', 'SettlDate': '',
                               'MDEntrySize': 2000.0, 'MDUpdateAction': '0',
                               'MDEntryTime': '10:37:56', 'Symbol': 'BA.C',
                               'MDEntryBuyer': '200', 'NumberOfOrders': '', 'MDEntryDate': '20180802'}},
                    {'bid': {'OrderID': '', 'SettlType': '3', 'MDEntrySeller': '046',
                             'Precio': '433', 'MDEntryPositionNo': '1', 'SettlDate': '',
                             'MDEntrySize': 10.0, 'MDUpdateAction': '0',
                             'MDEntryTime': '10:37:56', 'Symbol': 'BA.C',
                             'MDEntryBuyer': '262', 'NumberOfOrders': '', 'MDEntryDate': '20180802'}}],
       'MDBookType': 3}}
My question is:
How can I convert the JSON data (from the API) into Python data?
You already have Python data! recv_json() has already parsed the JSON into a Python dict for you.
Run this experiment:
while True:
    json = socket.recv_json()
    print(json['X'])
    x = json['X']  # now `x` is the "same" as json['X']
    print(x['MDReqId'])  # this is equivalent to print(json['X']['MDReqId'])
    md_req_id = x['MDReqId']
    print(md_req_id)  # should output `HUB22_1533207696768`
The results you are getting from recv_json() are already a built-in Python type (a dict). So you could do something like:
JSON['X']
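As a concrete illustration (a sketch based only on the keys visible in the sample messages above; the field meanings are assumptions), you can walk the nested structure directly:
msg = socket.recv_json()                       # already a Python dict
for entry in msg['X']['MDIncGrp']:
    for side, fields in entry.items():         # side is 'bid' or 'offer'
        print(side, fields['Symbol'], fields['Precio'], fields['MDEntrySize'])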
import csv
import requests
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv')
reader = csv.reader(webpage)
for row in reader:
    print(row)
Hi, I'm new to Python and I'm trying to open a CSV file from a URL and then display the rows so I can take the data that I need from it. However, I get an error saying:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
    for row in reader:
_csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
Thank you in advance.
You can try this:
import csv, requests
webpage=requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv')
reader = csv.reader(webpage.text.splitlines())
for row in reader:
    print(row)
Hope this helps.
Use .text, as you are getting bytes back in Python 3:
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv')
reader = csv.reader([webpage.text])
for row in reader:
    print(row)
That gives _csv.Error: new-line character seen in unquoted field, so split the lines after decoding. Also, stream=True will let you get the data in chunks rather than all at once, so you can filter by row as you go:
import csv
import requests
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv', stream=True)
for line in webpage:
    print(list(csv.reader(line.decode("utf-8").splitlines()))[0])
Which gives you:
['Day Ahead Hourly LMP Values for 20160427', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['00', '600', '700', '800', '900', '1000', '1100', '1200', '1300', '1400', '1500', '1600', '1700', '1800', '1900', '2000', '2100', '2200', '2300', '2400', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['1', '25.13', '25.03', '28.66', '25.94', '21.74', '19.47', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['600', '600', '600', '700', '700', '700', '800', '800', '800', '900', '900', '900', '1000', '1000', '1000', '1100', '1100', '1100', '1200', '1200', '1200', '1300', '1300', '1300', '1400', '1400', '1400', '1500', '']
['1500', '1500', '1600', '1600', '1600', '1700', '1700', '1700', '1800', '1800', '1800', '1900', '1900', '1900']
['', '2000', '2000', '2000', '2100', '2100', '2100', '2200', '2200', '2200', '2300', '2300', '2300', '2400', '2400', '2400', '']
['lLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'Tot']
['alLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'To']
['talLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'T']
.......................................
A variation on the answer by Padraic Cunningham uses iter_lines() from Requests and decodes each line with a list comprehension:
import csv
import requests
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv', stream = True)
webpage_decoded = [line.decode('utf-8') for line in webpage.iter_lines()]
reader = csv.reader(webpage_decoded)
Or, even simpler, you can have iter_lines() do the decoding:
webpage_decoded = webpage.iter_lines(decode_unicode=True)
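Either way, the decoded lines can then be fed to csv.reader and iterated as usual (a minimal usage sketch):
reader = csv.reader(webpage_decoded)
for row in reader:
    print(row)          # each row is a list of strings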
I have a Python data structure like this:
dl= [{'plat': 'unix', 'val':['', '', '1ju', '', '', '202', '', '']},
{'plat': 'Ios', 'val':['', '', '', '', 'Ty', '', 'Jk', '']},
{'plat': 'NT', 'val':['', '', 1, '', '' , '202', '', '']},
{'plat': 'centOs', 'val':['', '', '', '', '', '202', '', '']},
{'plat': 'ubuntu', 'val':['', 'KL', '1', '', '', '', '', '9i0']}]
I am trying to delete the positions in the 'val' lists where the value in that column is empty in every dict (for example, positions 0 and 3 in dl above). I am trying to get output like this:
Output= [{'plat': 'unix', 'val':['', '1ju', '', '202', '', '']},
{'plat': 'Ios', 'val':['', '', 'Ty', '', 'Jk', '']},
{'plat': 'NT', 'val':['', 1, '' , '202', '', '']},
{'plat': 'centOs', 'val':['', '', '', '202', '', '']},
{'plat': 'ubuntu', 'val':['KL', '1', '', '', '', '9i0']}]
Let's do this in two steps. First, find indices to remove:
lists = [e['val'] for e in dl]
idx_to_remove = [i for i, elem in enumerate(map(any, zip(*lists))) if not elem]
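For the sample dl above, only the columns at positions 0 and 3 are empty in every list, so (a quick check, not in the original answer):
print(idx_to_remove)    # [0, 3]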
Second, let's filter original lists:
for l in lists:
    l[:] = [elem for i, elem in enumerate(l) if i not in idx_to_remove]
Result:
>>> pprint.pprint(dl)
[{'plat': 'unix', 'val': ['', '1ju', '', '202', '', '']},
{'plat': 'Ios', 'val': ['', '', 'Ty', '', 'Jk', '']},
{'plat': 'NT', 'val': ['', 1, '', '202', '', '']},
{'plat': 'centOs', 'val': ['', '', '', '202', '', '']},
{'plat': 'ubuntu', 'val': ['KL', '1', '', '', '', '9i0']}]
dl = [{'plat': 'unix', 'val': ['', '', '1ju', '', '', '202', '', '']},
      {'plat': 'Ios', 'val': ['', '', '', '', 'Ty', '', 'Jk', '']},
      {'plat': 'NT', 'val': ['', '', 1, '', '', '202', '', '']},
      {'plat': 'centOs', 'val': ['', '', '', '', '', '202', '', '']},
      {'plat': 'ubuntu', 'val': ['', 'KL', '1', '', '', '', '', '9i0']}]

def empty_indices(lst):
    return {i for i, v in enumerate(lst) if not v}

# Need to special-case the first one to initialize the set of "empty" indices.
remove_idx = empty_indices(dl[0]['val'])

# Here we do the first one twice. We could use itertools.islice but it's
# probably not worth the minuscule speedup.
for item in dl:
    remove_idx &= empty_indices(item['val'])

for item in dl:
    item['val'] = [k for i, k in enumerate(item['val']) if i not in remove_idx]

# print the results.
import pprint
pprint.pprint(dl)
Yet another possible solution (not really efficient but well...). zip() is really underrated...
# extract the values as a list of lists
vals = [item["val"] for item in dl]
# transpose rows to columns
cols = map(list, zip(*vals))
# filter out empty columns (any() is falsy only when the whole column is empty)
cols = [c for c in cols if any(c)]
# re-transpose columns to rows
lines = map(list, zip(*cols))
# build the new list of dicts
output = [
    dict(plat=item["plat"], val=line) for item, line in zip(dl, lines)
]
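A quick way to check the result (not in the original answer):
import pprint
pprint.pprint(output)   # should match the desired Output shown in the question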
from operator import itemgetter

# create an iterator over columns (zip is lazy in Python 3; use itertools.izip on Python 2)
columns = zip(*(d['val'] for d in dl))
# build a function that keeps only the non-empty columns
keepfunc = itemgetter(*(i for i, c in enumerate(columns) if any(c)))
# apply the function to each list
for d in dl:
    d['val'] = list(keepfunc(d['val']))
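One caveat worth noting (an observation, not from the original answer): itemgetter called with a single index returns the bare item rather than a tuple, so list(keepfunc(d['val'])) would split a surviving string into characters if only one column remained. A small guard covers that edge case:
for d in dl:
    kept = keepfunc(d['val'])
    d['val'] = list(kept) if isinstance(kept, tuple) else [kept]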