I have a python data structure like this
dl= [{'plat': 'unix', 'val':['', '', '1ju', '', '', '202', '', '']},
{'plat': 'Ios', 'val':['', '', '', '', 'Ty', '', 'Jk', '']},
{'plat': 'NT', 'val':['', '', 1, '', '' , '202', '', '']},
{'plat': 'centOs', 'val':['', '', '', '', '', '202', '', '']},
{'plat': 'ubuntu', 'val':['', 'KL', '1', '', '', '', '', '9i0']}]
I am trying to delete the positions in the 'val' lists where the values in the same column are empty in every dict — for example, positions 0 and 3 in dl above. I am trying to get an output like this:
Output= [{'plat': 'unix', 'val':['', '1ju', '', '202', '', '']},
{'plat': 'Ios', 'val':['', '', 'Ty', '', 'Jk', '']},
{'plat': 'NT', 'val':['', 1, '' , '202', '', '']},
{'plat': 'centOs', 'val':['', '', '', '202', '', '']},
{'plat': 'ubuntu', 'val':['KL', '1', '', '', '', '9i0']}]
Let's do this in two steps. First, find indices to remove:
lists = [e['val'] for e in dl]
# a column index is removable when no row has a truthy value in that column
idx_to_remove = [i for i, elem in enumerate(map(any, zip(*lists))) if not elem]
Second, let's filter the original lists:
for l in lists:
    l[:] = [elem for i, elem in enumerate(l) if i not in idx_to_remove]
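Note that assigning to l[:] replaces each list's contents in place, so the change shows up in dl itself rather than only in lists.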
Result:
>>> pprint.pprint(dl)
[{'plat': 'unix', 'val': ['', '1ju', '', '202', '', '']},
{'plat': 'Ios', 'val': ['', '', 'Ty', '', 'Jk', '']},
{'plat': 'NT', 'val': ['', 1, '', '202', '', '']},
{'plat': 'centOs', 'val': ['', '', '', '202', '', '']},
{'plat': 'ubuntu', 'val': ['KL', '1', '', '', '', '9i0']}]
dl= [{'plat': 'unix', 'val':['', '', '1ju', '', '', '202', '', '']},
{'plat': 'Ios', 'val':['', '', '', '', 'Ty', '', 'Jk', '']},
{'plat': 'NT', 'val':['', '', 1, '', '' , '202', '', '']},
{'plat': 'centOs', 'val':['', '', '', '', '', '202', '', '']},
{'plat': 'ubuntu', 'val':['', 'KL','1', '', '', '', '', '9i0']}]
def empty_indices(lst):
    return {i for i, v in enumerate(lst) if not v}
# Need to special-case the first one to initialize the set of "empty" indices.
remove_idx = empty_indices(dl[0]['val'])
# Here we do the first one twice. We could use itertools.islice but it's
# probably not worth the minuscule speedup.
for item in dl:
    remove_idx &= empty_indices(item['val'])
for item in dl:
    item['val'] = [k for i, k in enumerate(item['val']) if i not in remove_idx]
# print the results.
import pprint
pprint.pprint(dl)
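As an aside, the special-casing of the first element can be avoided with set.intersection, which intersects all the per-row sets in one call — a sketch, assuming dl is non-empty:
# intersect the empty-index sets of every row at once
remove_idx = set.intersection(*(empty_indices(item['val']) for item in dl))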
Yet another possible solution (not really efficient, but well...). zip() is really underrated for transposing rows and columns:
# extract the values as a list of lists
vals = [item["val"] for item in dl]
# transpose rows to columns
cols = map(list, zip(*vals))
# filter out empty columns (any() is falsy only when every cell is empty)
cols = [c for c in cols if any(c)]
# transpose the surviving columns back to rows
lines = map(list, zip(*cols))
# build the new dicts
output = [
    dict(plat=item["plat"], val=line) for item, line in zip(dl, lines)
]
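As a quick aside on why zip() carries this solution: zip(*rows) transposes rows into columns, and applying it twice round-trips back (toy data for illustration):
>>> list(zip(*[[1, 2, 3], [4, 5, 6]]))
[(1, 4), (2, 5), (3, 6)]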
from operator import itemgetter
# create an iterator over columns (on Python 2, use itertools.izip)
columns = zip(*(d['val'] for d in dl))
# build a function that keeps only the non-empty columns
keepfunc = itemgetter(*(i for i, c in enumerate(columns) if any(c)))
# apply the function to each list
for d in dl:
    d['val'] = list(keepfunc(d['val']))
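One caveat worth noting: itemgetter behaves differently with one index versus several, so if only a single column survived, keepfunc would return a bare value and list(...) would fail:
from operator import itemgetter
itemgetter(0)([10, 20])     # 10 — a bare value
itemgetter(0, 1)([10, 20])  # (10, 20) — a tuple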
I am trying to scrape runner names and number of tips from this page: https://www.horseracing.net/racecards/newmarket/13-05-21
It is only returning the last runner name in the final race. I've been over and over it but can't see what I have done wrong.
Can anyone see the issue?
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = "https://www.horseracing.net/racecards/newmarket/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
date = []
course = []
time = []
runner = []
tips = []
runner_div = soup.find_all('div', class_='row-cell-right')
for container in runner_div:
    runner_name = container.h5.a.text
    runner.append(runner_name)
    tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
    tips.append(tips_no)
print(runner_name, tips_no)
Try print(runner, tips) instead of print(runner_name, tips_no):
Output:
print(runner, tips)
# ['Babindi', 'Turandot', 'Sharla', "Serena's Queen", 'Bellazada', 'Baby Alya', 'Adelita', 'Florence Street', 'Allerby', 'Puy Mary', 'Roman Mist', 'Lunar Shadow', 'Breakfastatiffanys', 'General Panic', 'Gidwa', 'Point Lynas', 'Three Dons', 'Wrought Iron', 'Desert Dreamer', 'Adatorio', 'Showmedemoney', 'The Charmer',
# 'Bascinet', 'Dashing Rat', 'Appellation', 'Cambridgeshire', 'Danni California', 'Drifting Sands', 'Lunar Gold', 'Malathaat', 'Miss Calacatta', 'Sunrise Valley', 'Sweet Expectation', 'White Lady', 'Riknnah', 'Aaddeey', 'High Commissioner', 'Kaloor', 'Rodrigo Diaz', 'Mukha Magic', 'Gauntlet', 'Hawridge Flyer', 'Clovis Point', 'Franco Grasso', 'Kemari', 'Magical Land', 'Mobarhin', 'Movin Time', 'Night Of Dreams', 'Punta Arenas', 'Constanta', 'Cosmic George', 'Taravara', 'Basilicata', 'Top Brass', 'Without Revenge', 'Grand Scheme', 'Easy Equation', 'Mr Excellency', 'Colonel Faulkner', 'Urban War', 'Freak Out', 'Alabama Boy', 'Anghaam', 'Arqoob', 'Fiordland', 'Dickens', "Shuv H'Penny King"]
# ['5', '3', '1', '3', '1', '', '1', '', '', '', '1', '', '', '', '', '1', '', '', '12', '1', '', '', '', '', '', '', '5', '', '1', '', '', '7', '', '', '1', '11', '1', '', '', '', '', '2', '', '', '1', '3', '2', '9', '', '', '', '', '5', '1', '4', '', '5', '', '1', '4', '2', '1', '3', '2', '1', '', '', '']
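Since pandas is already imported in the question, the two lists can also be combined into a single table once the loop has run — a small sketch, not part of the original answer:
import pandas as pd
# pair each runner with its tip count in one DataFrame
df = pd.DataFrame({'runner': runner, 'tips': tips})
print(df.head())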
I have a PDF that contains many tables. (I converted this PDF to CSV online, to extract the needed data more easily.)
The CSV rows are composed of many columns, but each table contains only 3 columns, so it is hard to know which CSV column a given cell belongs to.
I should also mention that one cell can span more than one line and more than one column.
An example of a table.
So is there any solution to extract these cells?
import csv
import re
class PDF_EXTRACTOR:
    FILE_NAME = None
    Ttableau = None
    NUMBER_OF_PAGES = None

    def __init__(self, fn):
        self.FILE_NAME = fn
        self.Ttableau = 0
        self.NUMBER_OF_PAGES = 0

    def EXTRACT_CELLULE(self):
        csv.register_dialect('mydialect', delimiter=',', skipinitialspace=True)
        print(csv.list_dialects())
        with open(self.FILE_NAME, 'r', encoding='utf8', errors='ignore') as csvFile:
            reader = csv.reader(csvFile, dialect='mydialect')
            for index, row in enumerate(reader):
                print(row)
I expected an output like this:
["Region 1","Region 2", "Region3]
["8,3-9","AUXILIAIRES DE LA MÉTÉOROLOGIE 5.54A 5.54B 5.54C",""]
["70-72","70-90","70-72"]
["RADIONAVIGATION 5.60","FIXE","MARITIME 5.60"]
But instead I got this:
['7 450-8 100', 'FIXE', '', '', '', '']
['', 'MOBILE sauf mobile aéronautique (R)', '', '', '', '']
['', '5.144', '', '', '', '']
['8 100-8 195', 'FIXE', '', '', '', '']
['', 'MOBILE MARITIME', '', '', '', '']
['8 195-8 815', 'MOBILE MARITIME', '5.109', '5.11', '5.132', '5.145']
['', '5.111', '', '', '', '']
['8 815-8 965', 'MOBILE AÉRONAUTIQUE (R)', '', '', '', '']
['8 965-9 040', 'MOBILE AÉRONAUTIQUE (OR)', '', '', '', '']
['9 040-9 305', '9 040-9 400', '', '', '', '9 040-9 305']
['FIXE', 'FIXE', '', '', '', 'FIXE']
['9 305-9 355', '', '', '', '', '9 305-9 355']
['FIXE', '', '', '', '', 'FIXE']
['Radiolocalisation 5.145A', '', '', '', '', 'Radiolocalisation 5.145A']
['5.145B', '', '', '', '', '']
['9 355-9 400', '', '', '', '', '9 355-9 400']
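No answer is recorded for this one, but the same empty-column trick from the first question above can serve as a first step — a minimal sketch on two of the rows shown, not a full solution for cells that span several lines:
# pad the rows to equal length, then keep only the columns
# that contain at least one non-empty value
rows = [
    ['8 100-8 195', 'FIXE', '', '', '', ''],
    ['', 'MOBILE MARITIME', '', '', '', ''],
]
width = max(len(r) for r in rows)
padded = [r + [''] * (width - len(r)) for r in rows]
keep = [i for i, col in enumerate(zip(*padded)) if any(col)]
cleaned = [[r[i] for i in keep] for r in padded]
print(cleaned)  # [['8 100-8 195', 'FIXE'], ['', 'MOBILE MARITIME']]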
I am trying to do the following when loading an internal data structure into pandas:
df = pd.DataFrame(self.data,
                  nrows=num_rows+500,
                  skiprows=skip_rows,
                  header=header_row,
                  usecols=limit_cols)
However, it doesn't appear to apply any of those options (as it does when reading a CSV file); only the data itself is loaded. Is there another method I can use to have more control over the data I'm ingesting? Or do I need to rebuild the data before loading it into pandas?
My input data looks like this:
data = [
['ABC', 'es-419', 'US', 'Movie', 'Full Extract', 'PARIAH', '', '', 'EST', 'Features - EST', 'HD', '2017-05-12 00:00:00', 'Open', 'WSP', '10.5000', '', '', '', '', '10.5240/8847-7152-6775-8B59-ADE0-Y', '10.5240/FFE3-D036-A9A4-9E7A-D833-1', '', '', '', '04065', '', '', '2011', '', '', '', '', '', '', '', '', '', '', '', '113811', '', '', '', '', '', '04065', '', 'Spanish (LAS)', 'US', '10', 'USA NATL SALE', '2017-05-11 00:00:00', 'TIER 3', '21', '', '', 'USA NATL SALE-SPANISH LANGUAGE', 'SPAN'],
['ABC', 'es-419', 'US', 'Movie', 'Full Extract', 'PATCH ADAMS', '', '', 'EST', 'Features - EST', 'HD', '2017-05-12 00:00:00', 'Open', 'WSP', '10.5000', '', '', '', '', '10.5240/DD84-FBF4-8F67-D6F3-47FF-1', '10.5240/B091-00D4-8215-39D8-0F33-8', '', '', '', 'U2254', '', '', '1998', '', '', '', '', '', '', '', '', '', '', '', '113811', '', '', '', '', '', 'U2254', '', 'Spanish (LAS)', 'US', '10', 'USA NATL SALE', '2017-05-11 00:00:00', 'TIER 3', '21', '', '', 'USA NATL SALE-SPANISH LANGUAGE', 'SPAN']
]
And so I'm looking to be able to state which rows it should load (or skip) and which columns it should use (usecols). Is that possible to do with an internal Python data structure?
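Those keyword arguments belong to pd.read_csv rather than the DataFrame constructor, so one option is to slice the structure in plain Python first — a minimal sketch, with hypothetical values for skip_rows, num_rows and limit_cols:
import pandas as pd

skip_rows, num_rows = 0, 2   # hypothetical values
limit_cols = [0, 1, 5]       # hypothetical: keep columns 0, 1 and 5

# slice the rows, then select the columns positionally
subset = data[skip_rows:skip_rows + num_rows]
df = pd.DataFrame(subset).iloc[:, limit_cols]
print(df)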
I have the following list of lists:
(['investmentseminar', '300', '', '', 'CNAME', '', 'domain.com.'], 7)
(['#', '300', '', '', '', '', '', '', '', 'CNAME', '', 'domain.com.'], 12)
(['#', '300', '', '', '', '', '', '', '', '', '', '', '', '', '', 'MX', '', '1', '', 'eu-smtp-inbound-1.com.'], 20)
(['#', '3600', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'TXT', '', 'MS=ms87183849'], 19)
(['#', '3600', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'TXT', '', 'MS=ms91398333'], 19)
It is from a parsed file with BIND data; I am trying to extract the record type and TTL, where the position of the items in the list is fixed.
This is the code I have so far:
lines = [
    ['#', '', '', 'MX', '', '10', '', 'relay1.netnames.net.'],
    ['#', '', '', 'MX', '', '20', '', 'relay2.netnames.net.'],
    ['#', '3600', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'TXT', '', 'MS=ms91398333'],
    ['#', '300', '', '', '', '', '', '', '', '', '', '', '', '', '', 'MX', '', '1', '', 'eu-smtp-inbound-1.com.'],
    ['domain.tld.', '3600', '', '', '', '', '', '', '', '', '', '', '', 'TXT', '', 'v=spf1 redirect=spf.domain.tld'],
    ['a.ns.slf', '', '', '', '', '', '', '', '', '', 'A', '', '192.123.54.133'],
    ['adfs', '', '', '', '', '', '', '', '', '', '', '', '', '', 'A', '', '192.123.67.20'],
]
record_set_list = []

def record_set(record):
    resource = {
        'Name': record[0],
        'TTL': record[1],
        'Type': record[4],
        'Value': record[-1]
    }
    record_set_list.append({'RecordSets': resource})

types = ['A', 'AAAA', 'CAA', 'CNAME', 'MX', 'NAPTR', 'PTR', 'SPF', 'SRV', 'TXT', 'ZONE']
# the rows are already split into fields, so iterate them directly
# (csv.reader expects an iterable of strings, not of lists)
for record in lines:
    if any(i in record for i in types):
        record_set(record)
How do I match the TTL, the Type and, in the case of an MX record, the preference?
Any advice is much appreciated.
Use the builtin function filter to remove the empty strings, zip the remaining values with the corresponding keys, and make a dict.
def record_set(record):
    keys = ['Name', 'TTL', 'Type', 'Value']
    values = filter(None, record)
    resource = dict(zip(keys, values))
    record_set_list.append({'RecordSets': resource})
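For MX records, the non-empty values contain an extra field (the preference) between the type and the exchange, so the four keys above would map 'Value' to the preference. A variation that accounts for this — a sketch, not part of the original answer, and it still assumes every row carries a TTL (rows without one mis-align, as with the original):
def record_set(record):
    values = [v for v in record if v]
    if 'MX' in values:
        keys = ['Name', 'TTL', 'Type', 'Preference', 'Value']
    else:
        keys = ['Name', 'TTL', 'Type', 'Value']
    record_set_list.append({'RecordSets': dict(zip(keys, values))})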
import csv
import requests
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv')
reader=csv.reader(webpage)
for row in reader:
    print(row)
Hi, I'm new to Python and I'm trying to open a CSV file from a URL and then display the rows so I can take the data that I need from it. However, I get an error saying:
Traceback (most recent call last):
  File "", line 1, in
    for row in reader:
Error: iterator should return strings, not bytes (did you open the file in text mode?)
Thank you in advance.
You can try this:
import csv, requests
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv')
# .content is bytes on Python 3, so decode before splitting into lines
reader = csv.reader(webpage.content.decode('utf-8').splitlines())
for row in reader:
    print(row)
Hope this will help
Use .text, as you are getting bytes returned in Python 3:
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv')
reader = csv.reader([webpage.text])
for row in reader:
    print(row)
That gives _csv.Error: new-line character seen in unquoted field, so split the lines after decoding. Also, stream=True will let you receive the data in chunks rather than all at once, so you can filter by row and write as you go:
import csv
import requests

webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv', stream=True)
for line in webpage:
    print(list(csv.reader(line.decode("utf-8").splitlines()))[0])
Which gives you:
['Day Ahead Hourly LMP Values for 20160427', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['00', '600', '700', '800', '900', '1000', '1100', '1200', '1300', '1400', '1500', '1600', '1700', '1800', '1900', '2000', '2100', '2200', '2300', '2400', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['1', '25.13', '25.03', '28.66', '25.94', '21.74', '19.47', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['600', '600', '600', '700', '700', '700', '800', '800', '800', '900', '900', '900', '1000', '1000', '1000', '1100', '1100', '1100', '1200', '1200', '1200', '1300', '1300', '1300', '1400', '1400', '1400', '1500', '']
['1500', '1500', '1600', '1600', '1600', '1700', '1700', '1700', '1800', '1800', '1800', '1900', '1900', '1900']
['', '2000', '2000', '2000', '2100', '2100', '2100', '2200', '2200', '2200', '2300', '2300', '2300', '2400', '2400', '2400', '']
['lLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'Tot']
['alLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'To']
['talLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'T']
.......................................
A variation on the answer by Padraic Cunningham uses iter_lines() from Requests and decodes each line using a list comprehension:
import csv
import requests
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv', stream = True)
webpage_decoded = [line.decode('utf-8') for line in webpage.iter_lines()]
reader = csv.reader(webpage_decoded)
Or, even simpler, you can have iter_lines() do the decoding:
webpage_decoded = webpage.iter_lines(decode_unicode=True)
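Feeding either variant to csv.reader then works the same way — a minimal sketch, assuming a fresh streamed response and that the server declares a text encoding (otherwise decode_unicode falls back to yielding bytes):
reader = csv.reader(webpage.iter_lines(decode_unicode=True))
for row in reader:
    print(row)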