formatting dictionary keys read from pdf - python

How can I convert these dictionary keys to the format described below?
original_di={'001': '', '002': '', '3': '24s', '004': '42s', '5': '', '006': '', '007': '', '008': '', '009': '', '010': '', '011': '', '012\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '013': '', '014': '', '015': '', '016': '', '017': '', '018': '', '019': '', '020': '', '021': '', '022': '', '023': '', '024': '', '025': '', '026': '', '027': '', '028': '', '029': '', '030': '', '031': '', '032': '', '033': '', '041': '', '042': '', '043': '', '044': '', '045': '', '046': '', '047': '', '048': '', '049': '', '050': '', '051': '', '052': '', '053': '', '054': '', '055': '', '056\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '035': '', '037': '', '039\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '034': '', '036': '', '038': '', '040\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '057': '', '092': '', '058': '', '059': '', '060': '', '061': '', '062': '', '063\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '064\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '065\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '066': '', '067': '', '068': '', '069': '', '070': '', '071': '', '072': '', '073': '', '074': '', '075': '', '076': '', '077': '', '078': '', '079': '', '080': '', '081': '', '082': '', '083': '', '084': '', '085\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '086': '', '087': '', '088': '', '089\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '090': '', '091': '', '093': '', '094': '', '095': '', '096': '', '097': '', '098': '', '099': '', '100': '', '101': '', '102': '', '103': '', '104': '', '105': '', '106': '', '107': '', '108': '', '109': '', '110': '', '111': '', '112': '', '113': '', '114': '', '115': '', '116': '', '117': '', '118': '', '119': '', '120': '', '121': '', '122': '', '123': '', '124': '', '125': '', '126': '', '127': '', '128': '', '129': '', '130': '', '131': '', '132': '', '133': '', '134': '', '135': '', '136': '', '137': '', '138': '', '139': '', '140': '', '141': '', '142': '', '143': '', '144': '', '145': '87e', '146': '', '147': '', '148': '', '149\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '150\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '151\r\r\r\r\r\r\r\r\r\r\r\r\r': '', 
'152\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '153\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '154\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '155\r\r\r\r\r\r\r\r\r\r\r\r\r': 'US', '156': ''}
Some keys have extra \r or \t characters, and some keys aren't 3 digits long.
Ideally, the output I want is for all keys to be 3 digits, e.g. 001, 003, 050, 111 (without \r or \t).

try this, strip to remove new-line characters & rjust to fill in the values
{k.strip().rjust(3, "0"): v.strip() for k, v in original_di.items()}

for k, v in original_di.items() - Iterate over the dict; k holds each key and v holds each value.
int(k.strip()) - Strip surrounding whitespace characters (e.g. \r, \n or \t) from the key, then cast the string to an integer.
"{0:0=3d}".format(x) - Build a string that is zero-padded to 3 digits from the integer x (here x is int(k.strip())).
: v.strip() - Strip surrounding whitespace characters (e.g. \r, \n or \t) from the value.
Code:
original_di={'001': '', '002': '', '3': '24s', '004': '42s', '5': '', '006': '', '007': '', '008': '', '009': '', '010': '', '011': '', '012\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '013': '', '014': '', '015': '', '016': '', '017': '', '018': '', '019': '', '020': '', '021': '', '022': '', '023': '', '024': '', '025': '', '026': '', '027': '', '028': '', '029': '', '030': '', '031': '', '032': '', '033': '', '041': '', '042': '', '043': '', '044': '', '045': '', '046': '', '047': '', '048': '', '049': '', '050': '', '051': '', '052': '', '053': '', '054': '', '055': '', '056\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '035': '', '037': '', '039\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '034': '', '036': '', '038': '', '040\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '057': '', '092': '', '058': '', '059': '', '060': '', '061': '', '062': '', '063\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '064\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '065\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '066': '', '067': '', '068': '', '069': '', '070': '', '071': '', '072': '', '073': '', '074': '', '075': '', '076': '', '077': '', '078': '', '079': '', '080': '', '081': '', '082': '', '083': '', '084': '', '085\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '086': '', '087': '', '088': '', '089\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '090': '', '091': '', '093': '', '094': '', '095': '', '096': '', '097': '', '098': '', '099': '', '100': '', '101': '', '102': '', '103': '', '104': '', '105': '', '106': '', '107': '', '108': '', '109': '', '110': '', '111': '', '112': '', '113': '', '114': '', '115': '', '116': '', '117': '', '118': '', '119': '', '120': '', '121': '', '122': '', '123': '', '124': '', '125': '', '126': '', '127': '', '128': '', '129': '', '130': '', '131': '', '132': '', '133': '', '134': '', '135': '', '136': '', '137': '', '138': '', '139': '', '140': '', '141': '', '142': '', '143': '', '144': '', '145': '87e', '146': '', '147': '', '148': '', '149\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '150\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '151\r\r\r\r\r\r\r\r\r\r\r\r\r': '', 
'152\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '153\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '154\r\r\r\r\r\r\r\r\r\r\r\r\r': '', '155\r\r\r\r\r\r\r\r\r\r\r\r\r': 'US', '156': ''}
# Normalise every key to a zero-padded 3-digit string and strip every value,
# then print the resulting dict directly.
print(
    {
        "{0:0=3d}".format(int(k.strip())): v.strip()
        for k, v in original_di.items()
    }
)
Output:
>>> python3 test.py
{'001': '', '002': '', '003': '24s', '004': '42s', '005': '', '006': '', '007': '', '008': '', '009': '', '010': '', '011': '', '012': '', '013': '', '014': '', '015': '', '016': '', '017': '', '018': '', '019': '', '020': '', '021': '', '022': '', '023': '', '024': '', '025': '', '026': '', '027': '', '028': '', '029': '', '030': '', '031': '', '032': '', '033': '', '041': '', '042': '', '043': '', '044': '', '045': '', '046': '', '047': '', '048': '', '049': '', '050': '', '051': '', '052': '', '053': '', '054': '', '055': '', '056': '', '035': '', '037': '', '039': '', '034': '', '036': '', '038': '', '040': '', '057': '', '092': '', '058': '', '059': '', '060': '', '061': '', '062': '', '063': '', '064': '', '065': '', '066': '', '067': '', '068': '', '069': '', '070': '', '071': '', '072': '', '073': '', '074': '', '075': '', '076': '', '077': '', '078': '', '079': '', '080': '', '081': '', '082': '', '083': '', '084': '', '085': '', '086': '', '087': '', '088': '', '089': '', '090': '', '091': '', '093': '', '094': '', '095': '', '096': '', '097': '', '098': '', '099': '', '100': '', '101': '', '102': '', '103': '', '104': '', '105': '', '106': '', '107': '', '108': '', '109': '', '110': '', '111': '', '112': '', '113': '', '114': '', '115': '', '116': '', '117': '', '118': '', '119': '', '120': '', '121': '', '122': '', '123': '', '124': '', '125': '', '126': '', '127': '', '128': '', '129': '', '130': '', '131': '', '132': '', '133': '', '134': '', '135': '', '136': '', '137': '', '138': '', '139': '', '140': '', '141': '', '142': '', '143': '', '144': '', '145': '87e', '146': '', '147': '', '148': '', '149': '', '150': '', '151': '', '152': '', '153': '', '154': '', '155': 'US', '156': ''}

Related

Is there a way to iterate through a list of lists without getting an index error? [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
This question does not appear to be about programming within the scope defined in the help center.
Closed 6 months ago.
Improve this question
I have a list of lists and I am trying to pull out every nth term from each list within the list.
Here is my input:
[['', '', '', '', '1', '', '', '', '', '', '', '', '1TD1131D17025-2035', '', '', '',
'', '', '', '', '', '', '', '', '', '', '09/16/2022', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '2',
'', '', '', '', 'EA', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '353.60', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '707.20', '\n'], ['', '', '', '', '2', '',
'', '', '', '', '', '', '1TD1131D17025-2036', '', '', '', '', '', '', '', '', '', '',
'', '', '', '09/16/2022', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', '', '', '', '', 'EA', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'353.60', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '707.20', '\n'], ['', '', '', '', '3', '', '', '', '', '', '', '',
'1TD1131D17025-2037', '', '', '', '', '', '', '', '', '', '', '', '', '', '09/16/2022',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '2', '', '', '', '', 'EA', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '353.60', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '707.20',
'\n']]
Right now I am trying to pull out the first integer from each list.
Here is my sample code.
# NOTE(review): this snippet's indentation was lost in the paste, so the intended
# nesting cannot be confirmed from here.
def find(n,e):
# Runs once per entry in line_nu; the loop variable `line` is never used.
for line in range(len(line_nu)):
# Indexes with the caller-supplied n and e rather than with the loop variable.
item = line_nu[n][e]
# n is advanced by hand from whatever value the caller passed in.
n += 1
# Returns the result of item_nu.append(item); if item_nu is a list (presumably), that is None.
return item_nu.append(item)
I'm getting an 'Index out of range' error.
I can call ' line_nu[0][4] ' outside of this loop, but using same numbers in def find() I get an error. I have also tried this as a while loop where I replace n with i and start count at 0. Same error.
The end goal is to get each non-empty item (anything other than '') into a list of its own.
Anyone know what I'm doing wrong?
assuming your list of lists has the name data, we can get the first integer of each sublist like:
data = [[...],[...],...]
# Print the first all-digit string found in each sub-list of `data`.
for sublist in data:  # named `sublist` so the built-in `list` is not shadowed
    for item in sublist:
        if item.isdigit():
            print(item)
            # Only the first match per sub-list is wanted.
            break
I interpreted your question as looking for the nth non-empty element in each sub list, so elements != ''. This function will execute that and return an array of the nth non-empty element in each sub array
exl = [['', '', '', '', '1', '', '', '', '', '', '', '', '1TD1131D17025-2035', '', '', '',
'', '', '', '', '', '', '', '', '', '', '09/16/2022', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '2',
'', '', '', '', 'EA', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '353.60', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '707.20', '\n'], ['', '', '', '', '2', '',
'', '', '', '', '', '', '1TD1131D17025-2036', '', '', '', '', '', '', '', '', '', '',
'', '', '', '09/16/2022', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', '', '', '', '', 'EA', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'353.60', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '707.20', '\n'], ['', '', '', '', '3', '', '', '', '', '', '', '',
'1TD1131D17025-2037', '', '', '', '', '', '', '', '', '', '', '', '', '', '09/16/2022',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '2', '', '', '', '', 'EA', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '353.60', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '707.20',
'\n']]
def find(n,listOfLists):
    """Collect the n-th non-empty string from each sub-list.

    Args:
        n: 1-based position among the items that are not ''.
        listOfLists: iterable of lists of strings.

    Returns:
        A list with one entry for every sub-list that has at least ``n``
        non-empty items; sub-lists with fewer contribute nothing.
    """
    result = []
    for sublist in listOfLists:  # avoid shadowing the built-in ``list``
        discovered_items = 0
        for item in sublist:
            if item != '':
                discovered_items += 1
                if discovered_items == n:
                    result.append(item)
                    # The n-th item is found; move on to the next sub-list.
                    break
    return result
So to get the first non-empty item in each list (the integers you mentioned above)...
find(1,exl)
# ['1', '2', '3']
And the dates (aka 3rd non-empty element):
find(3,exl)
# ['09/16/2022', '09/16/2022', '09/16/2022']
You'll need two for loops. For example:
# Sample data: three rows of mostly-empty strings, each ending in '\n'.
test = [['', '', '', '', '1', '', '', '', '', '', '', '', '1TD1131D17025-2035', '', '', '',
'', '', '', '', '', '', '', '', '', '', '09/16/2022', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '2',
'', '', '', '', 'EA', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '353.60', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '707.20', '\n'], ['', '', '', '', '2', '',
'', '', '', '', '', '', '1TD1131D17025-2036', '', '', '', '', '', '', '', '', '', '',
'', '', '', '09/16/2022', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', '', '', '', '', 'EA', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'353.60', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '707.20', '\n'], ['', '', '', '', '3', '', '', '', '', '', '', '',
'1TD1131D17025-2037', '', '', '', '', '', '', '', '', '', '', '', '', '', '09/16/2022',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '2', '', '', '', '', 'EA', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '353.60', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '707.20',
'\n']]  # the outer list is closed here; the original was missing this bracket

# Two nested loops: the outer one walks the rows, the inner one walks each row's cells.
for row in test:
    for cell in row:
        print(cell)
If the numbers are always at the same index for all sublists, you can get a list of the numbers (as strings) by looping once:
def find(lists, n):
    """Return the item at index ``n`` of every sub-list, in order.

    Args:
        lists: iterable of indexable sequences.
        n: index to read from each sub-list.

    Returns:
        A list holding ``lst[n]`` for each ``lst`` in ``lists``.

    Raises:
        IndexError: if any sub-list has no index ``n``.
    """
    out = []
    for lst in lists:
        out.append(lst[n])
    return out
# call it with n=4
numbers = find(mylists, 4)
Obviously, a list comprehension can be more appropriate here:
def find(lists, n):
    """Return the item at index ``n`` of every sub-list, in order.

    Raises:
        IndexError: if any sub-list has no index ``n``.
    """
    return [lst[n] for lst in lists]
L = [['', '', '', '', '1', '', '', '', '', '', '', '', '1TD1131D17025-2035', '', '', '',
'', '', '', '', '', '', '', '', '', '', '09/16/2022', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '2',
'', '', '', '', 'EA', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '353.60', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '707.20', '\n'], ['', '', '', '', '2', '',
'', '', '', '', '', '', '1TD1131D17025-2036', '', '', '', '', '', '', '', '', '', '',
'', '', '', '09/16/2022', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', '', '', '', '', 'EA', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'353.60', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '707.20', '\n'], ['', '', '', '', '3', '', '', '', '', '', '', '',
'1TD1131D17025-2037', '', '', '', '', '', '', '', '', '', '', '', '', '', '09/16/2022',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '2', '', '', '', '', 'EA', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '353.60', '', '', '', '', '',
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '707.20',
'\n']]
# Print the distinct values of each row; a set drops the repeated '' entries.
for row in L:
    print(set(row))
OUTPUT:
{'', '707.20', '353.60', '1TD1131D17025-2035', '2', 'EA', '09/16/2022', '1', '\n'}
{'', '707.20', '353.60', '2', 'EA', '09/16/2022', '1TD1131D17025-2036', '\n'}
{'', '707.20', '353.60', '2', 'EA', '09/16/2022', '1TD1131D17025-2037', '\n', '3'}

Selenium Python not able to extract text within all span tags

I am creating a small python program that automates 10fastfingers. In order to do that, I have to first extract all the words that I have to type. All these words are stored within span tags like this:
When I run my code, it just extracts the first 20-30 words rather than extracting all the words. Why is this so? Here is my code:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time

url = "https://10fastfingers.com/typing-test/english"
browser = webdriver.Chrome("D:\\Python_Files\\Programs\\chromedriver.exe")
browser.get(url)
time.sleep(10)

count = 1
wordlst = []
# Walk the <span> children of the element with id "row1" one at a time
# (XPath positions are 1-based) until no span exists at that position.
while True:
    try:
        # '@id' is the XPath attribute test; '#id' is not valid XPath.
        word = browser.find_element_by_xpath(f'//*[@id="row1"]/span[{count}]')
    except NoSuchElementException:
        # No span at this position: every span has been visited.
        break
    wordlst.append(word.text)
    count += 1
print(wordlst)
Output:
['them', 'how', 'said', 'light', 'show', 'seem', 'not', 'two', 'under', 'hear', 'them', 'there', 'about', 'face', 'us', 'change', 'year', 'only', 'leave', 'number', 'found', 'father', 'people', 'house', 'really', 'my', 'spell', 'when', 'look', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
How to solve this problem? Any help would be appreciated. Thanks!
You can do that with BeautifulSoup
# Load the page in a real browser, then parse the resulting HTML with BeautifulSoup.
from selenium import webdriver
import time
from bs4 import BeautifulSoup
url = "https://10fastfingers.com/typing-test/english"
browser = webdriver.Chrome("D:\\Python_Files\\Programs\\chromedriver.exe")
browser.get(url)
# Fixed 3-second pause before reading the page source (presumably to let the page finish loading).
time.sleep(3)
html_soup = BeautifulSoup(browser.page_source, 'html.parser')
# Every <div> whose id is "row1"; only the first match is used below.
div = html_soup.find_all('div', id = 'row1')
# Take that div's text and split it on whitespace into a list of words.
wordlst=div[0].get_text().split()
browser.quit()
print(wordlst)
OR
to continue your approach,
from selenium import webdriver
import time
url = "https://10fastfingers.com/typing-test/english"
browser = webdriver.Chrome("D:\\Python_Files\\Programs\\chromedriver.exe")
browser.get(url)
time.sleep(6)
# '@id' is the XPath attribute test ('#id' is not valid XPath); this fetches
# every <span> directly under div#row1 in a single call.
wordlst=browser.find_elements_by_xpath('//div[@id="row1"]/span')
# Read each span's innerHTML rather than .text (the question's .text calls
# came back as '' for most spans, presumably because they are not displayed).
wordlst=[x.get_attribute("innerHTML") for x in wordlst]
browser.quit()
print(wordlst)

Is it possible to return the correct values from a list of dictionaries based on user inputted subject ID?

I apologize for any formatting issues or unclear parts, I'm VERY new to Python and programming in general. I want to make a script that pulls a list of research participant records (the code provided here is sample data) and the list contains separate dictionary-like items that have all of the screening questions (including the record id, or subject ID). I want to pull particular items (self-harm reports and suicidal thoughts questions) from this depending on what the script's user inputs as the record id
I want the script to be able to pull from a growing list of dictionaries, so it has to index into that list. So far I have tried to return a tuple based on the user input, but it returns the same values
regardless of what I input for subj: it returns the same three values ('1', '2', '1'), the values of ONLY the first dictionary.
from redcap import Project, RedcapError
URL = 'https://redcap.lib.umd.edu/api/'
#API KEY for sample data
API_KEY = 'B2E685118B86FA89F57C49A1C9A38BDC'
project = Project(URL, API_KEY)
all_data = project.export_records()
# NOTE(review): this snippet's indentation was lost in the paste, so the intended
# nesting cannot be confirmed from here.
def find(subj, data):
# Position used by the final lookup; it stays 0 unless the match branch below reassigns it.
index = 0
# Running counter; it is incremented in both branches of the if/else below.
j = 0
for i in data:
for k,v in i.items():
if k == 'record_id' and v == subj:
index = j
j+=1
else:
j+=1
# Returns a 3-tuple of fields read from data[index].
return data[index]['record_id'],data[index]['selfharm_18yr'],data[index]['talksaboutkillingself_18yr']
AN EXAMPLE OF DATA RECORD
[{'record_id': '1', 'child_gender': '', 'c_age': '', 'c_dob': '', 't_date': '', 'school_yn': '', 'school_grade': '', 'father_job': '', 'mother_work': '', 'parentgender': '', 'relation_to_child': '', 'other': '', 'no_sports': '', 'sport_a': '', 'average_time_a': '', 'average_skill_a': '', 'sport_b_yes': '', 'sport_b': '', 'average_time_b': '', 'average_skill_b': '', 'sport_c_yes': '', 'sport_c': '', 'average_time_c': '', 'average_skill_c': '', 'hobby_a_yes': '', 'hobby_a': '', 'hobby_a_time': '', 'hobby_a_skill': '', 'hobby_b_yes': '', 'hobby_b': '', 'hobby_b_time': '', 'hobby_b_skill': '', 'hobby_c_yes': '', 'hobby_c': '', 'hobby_c_time': '', 'hobby_c_skill': '', 'clubs': '', 'club1': '', 'activeclub1': '', 'clubs_2': '', 'club2': '', 'activeclub2': '', 'clubs_3': '', 'club3': '', 'activeclub3': '', 'chore_a_yes': '', 'chore_a': '', 'chore_a_skill': '', 'chore_b_yes': '', 'chore_b': '', 'chore_b_skill': '', 'chore_c_yes': '', 'chore_c': '', 'chore_c_skill': '', 'close_friends': '', 'friends': '', 'get_along_siblings': '', 'along_withkids': '', 'behave': '', 'play_work': '', 'attend_school': '', 'school_reason': '', 'performance1': '', 'performance2': '', 'performance3': '', 'performance4': '', 'othersubjects': '', 'other_subjects': '', 'performanceother': '', 'other2': '', 'other_subjects_2': '', 'performanceother_2': '', 'other3': '', 'other_subjects_3': '', 'performanceother_3': '', 'specialeducation': '', 'sp_ed': '', 'repeat_grades': '', 'repeat2': '', 'academic_problems': '', 'describe_problems': '', 'problems_date': '', 'problems_yn': '', 'end_problems': '', 'disabilities': '', 'disability2': '', 'concerns': '', 'best_things': '', 'too_young': '', 'alcohol': '', 'describe_alc18yr': '', 'argues': '', 'fails_finishing_things': '', 'enjoyment': '', 'bm': '', 'bragging': '', 'concentration': '', 'obsessions': '', 'describe_obesessions': '', 'restlessness': '', 'dependence': '', 'lonely': '', 'confusion': '', 'crying': '', 'cruelty_animals': '', 'cruelty': '', 
'daydreams': '', 'selfharm_18yr': '2', 'attention': '', 'destruction': '', 'destruction2': '', 'disobedience': '', 'school_disobedience': '', 'eating_well': '', 'getting_along': '', 'guilt_misbehaving': '', 'jealousy': '', 'rule_breaking': '', 'fearful': '', 'describe_fears': '', 'fears_school': '', 'fears_thoughts': '', 'perfection': '', 'loveless': '', 'others_outtoget': '', 'worthlessness': '', 'accident_prone': '', 'fights': '', 'teasing': '', 'trouble_makers': '', 'voices': '', 'describe_voices': '', 'impulsive_acts': '', 'solitary': '', 'lying_cheating': '', 'fingernails': '', 'tense': '', 'movements': '', 'describe_movements': '', 'nightmares': '', 'likeability': '', 'constipation': '', 'fear_anxiety': '', 'dizziness': '', 'guilt': '', 'overeating': '', 'overtired': '', 'overweight': '', 'aches_pains': '', 'headaches': '', 'nausea': '', 'eye_problems': '', 'describe_eyes': '', 'skin_problems': '', 'stomach_aches': '', 'vomiting': '', 'other_conditions': '', 'describe_other': '', 'physical_violence': '', 'picks_skin': '', 'describe_skin': '', 'public': '', 'public2': '', 'school_work': '', 'coordination': '', 'older_kids': '', 'younger_kids': '', 'talking_refusal': '', 'compulsions': '', 'describe_compulsions': '', 'runs_away': '', 'screams': '', 'secretive': '', 'seeing_things': '', 'describe_seeingthings': '', 'self_conscious': '', 'sets_fires': '', 'sexual_problems': '', 'describe_sexualproblems': '', 'clowning': '', 'shy_timid': '', 'sleeps_less': '', 'sleeps_more': '', 'describe_sleeping': '', 'inattentive': '', 'speech_problems': '', 'describe_speechproblems': '', 'stares_blankly': '', 'steals_home': '', 'steals_outside': '', 'stores': '', 'describe_hoarding': '', 'strange_behavior': '', 'describe_strangebehavior': '', 'strange_ideas': '', 'describe_ideas': '', 'stubborn_sullen': '', 'mood_changes': '', 'sulking': '', 'suspicious': '', 'swearing_obscenities': '', 'talksaboutkillingself_18yr': '1', 'sleeptalking_walking': '', 'describe_sleeptalking': '', 
'talks_toomuch': '', 'frequent_teasing': '', 'temper_tantrums': '', 'thinks_sex': '', 'threatens_people': '', 'thumb_sucking': '', 'smoking': '', 'sleeping_troubles': '', 'describe_sleepingtroubles': '', 'truancy': '', 'low_energy': '', 'depression': '', 'loud': '', 'uses_drugs': '', 'describe_drugusage': '', 'vandalism': '', 'wets_self': '', 'wets_bed': '', 'whining': '', 'opposite_sex': '', 'withdrawn': '', 'frequent_worries': '', 'additional_problems': '', 'problem_a': '', 'prob_a_true': '', 'problem_b_yes': '', 'problem_b': '', 'prob_b_true': '', 'problem_c_yes': '', 'problem_c': '', 'prob_c_true': ''}, {'record_id': '2', 'child_gender': '', 'c_age': '', 'c_dob': '', 't_date': '', 'school_yn': '', 'school_grade': '', 'father_job': '', 'mother_work': '', 'parentgender': '', 'relation_to_child': '', 'other': '', 'no_sports': '', 'sport_a': '', 'average_time_a': '', 'average_skill_a': '', 'sport_b_yes': '', 'sport_b': '', 'average_time_b': '', 'average_skill_b': '', 'sport_c_yes': '', 'sport_c': '', 'average_time_c': '', 'average_skill_c': '', 'hobby_a_yes': '', 'hobby_a': '', 'hobby_a_time': '', 'hobby_a_skill': '', 'hobby_b_yes': '', 'hobby_b': '', 'hobby_b_time': '', 'hobby_b_skill': '', 'hobby_c_yes': '', 'hobby_c': '', 'hobby_c_time': '', 'hobby_c_skill': '', 'clubs': '', 'club1': '', 'activeclub1': '', 'clubs_2': '', 'club2': '', 'activeclub2': '', 'clubs_3': '', 'club3': '', 'activeclub3': '', 'chore_a_yes': '', 'chore_a': '', 'chore_a_skill': '', 'chore_b_yes': '', 'chore_b': '', 'chore_b_skill': '', 'chore_c_yes': '', 'chore_c': '', 'chore_c_skill': '', 'close_friends': '', 'friends': '', 'get_along_siblings': '', 'along_withkids': '', 'behave': '', 'play_work': '', 'attend_school': '', 'school_reason': '', 'performance1': '', 'performance2': '', 'performance3': '', 'performance4': '', 'othersubjects': '', 'other_subjects': '', 'performanceother': '', 'other2': '', 'other_subjects_2': '', 'performanceother_2': '', 'other3': '', 'other_subjects_3': '', 
'performanceother_3': '', 'specialeducation': '', 'sp_ed': '', 'repeat_grades': '', 'repeat2': '', 'academic_problems': '', 'describe_problems': '', 'problems_date': '', 'problems_yn': '', 'end_problems': '', 'disabilities': '', 'disability2': '', 'concerns': '', 'best_things': '', 'too_young': '', 'alcohol': '', 'describe_alc18yr': '', 'argues': '', 'fails_finishing_things': '', 'enjoyment': '', 'bm': '', 'bragging': '', 'concentration': '', 'obsessions': '', 'describe_obesessions': '', 'restlessness': '', 'dependence': '', 'lonely': '', 'confusion': '', 'crying': '', 'cruelty_animals': '', 'cruelty': '', 'daydreams': '', 'selfharm_18yr': '3', 'attention': '', 'destruction': '', 'destruction2': '', 'disobedience': '', 'school_disobedience': '', 'eating_well': '', 'getting_along': '', 'guilt_misbehaving': '', 'jealousy': '', 'rule_breaking': '', 'fearful': '', 'describe_fears': '', 'fears_school': '', 'fears_thoughts': '', 'perfection': '', 'loveless': '', 'others_outtoget': '', 'worthlessness': '', 'accident_prone': '', 'fights': '', 'teasing': '', 'trouble_makers': '', 'voices': '', 'describe_voices': '', 'impulsive_acts': '', 'solitary': '', 'lying_cheating': '', 'fingernails': '', 'tense': '', 'movements': '', 'describe_movements': '', 'nightmares': '', 'likeability': '', 'constipation': '', 'fear_anxiety': '', 'dizziness': '', 'guilt': '', 'overeating': '', 'overtired': '', 'overweight': '', 'aches_pains': '', 'headaches': '', 'nausea': '', 'eye_problems': '', 'describe_eyes': '', 'skin_problems': '', 'stomach_aches': '', 'vomiting': '', 'other_conditions': '', 'describe_other': '', 'physical_violence': '', 'picks_skin': '', 'describe_skin': '', 'public': '', 'public2': '', 'school_work': '', 'coordination': '', 'older_kids': '', 'younger_kids': '', 'talking_refusal': '', 'compulsions': '', 'describe_compulsions': '', 'runs_away': '', 'screams': '', 'secretive': '', 'seeing_things': '', 'describe_seeingthings': '', 'self_conscious': '', 'sets_fires': '', 
'sexual_problems': '', 'describe_sexualproblems': '', 'clowning': '', 'shy_timid': '', 'sleeps_less': '', 'sleeps_more': '', 'describe_sleeping': '', 'inattentive': '', 'speech_problems': '', 'describe_speechproblems': '', 'stares_blankly': '', 'steals_home': '', 'steals_outside': '', 'stores': '', 'describe_hoarding': '', 'strange_behavior': '', 'describe_strangebehavior': '', 'strange_ideas': '', 'describe_ideas': '', 'stubborn_sullen': '', 'mood_changes': '', 'sulking': '', 'suspicious': '', 'swearing_obscenities': '', 'talksaboutkillingself_18yr': '2', 'sleeptalking_walking': '', 'describe_sleeptalking': '', 'talks_toomuch': '', 'frequent_teasing': '', 'temper_tantrums': '', 'thinks_sex': '', 'threatens_people': '', 'thumb_sucking': '', 'smoking': '', 'sleeping_troubles': '', 'describe_sleepingtroubles': '', 'truancy': '', 'low_energy': '', 'depression': '', 'loud': '', 'uses_drugs': '', 'describe_drugusage': '', 'vandalism': '', 'wets_self': '', 'wets_bed': '', 'whining': '', 'opposite_sex': '', 'withdrawn': '', 'frequent_worries': '', 'additional_problems': '', 'problem_a': '', 'prob_a_true': '', 'problem_b_yes': '', 'problem_b': '', 'prob_b_true': '', 'problem_c_yes': '', 'problem_c': '', 'prob_c_true': ''}]
I expect it to output a tuple of the three values, depending on the record id of the corresponding dictionary, but instead it outputs the same thing regardless of the subject ID.
AN EXAMPLE OF THE OUTPUT
find('1', all_data)
('1', '2', '1')
find('2', all_data)
('1', '2', '1')
In the future I also want to be able to send those to an Excel spreadsheet.
So in this case, you're doing a ton of unnecessary iteration. The beauty of python dictionaries is that they're hashed and optimized for lookup operations.
Rather than iterating through keys and values, all you need to do is supply the key as the index and return early if the record exists. (Note, I changed a few names around for clarity, and to ensure that things like find() don't shadow built-in methods from other classes)
def find_item(subj, data):
    """Look up one participant record by its record id.

    Args:
        subj: the record id to search for (compared with ``==``).
        data: iterable of dicts, each with the keys 'record_id',
            'selfharm_18yr' and 'talksaboutkillingself_18yr'.

    Returns:
        A 3-tuple ``(record_id, selfharm_18yr, talksaboutkillingself_18yr)``
        from the first matching record, or the string "No Records Found"
        when no record matches.

    Raises:
        KeyError: if a record examined lacks one of the keys above.
    """
    for subdict in data:
        if subdict['record_id'] == subj:
            # Return as soon as the record is found; no need to scan the rest.
            return (
                subdict['record_id'],
                subdict['selfharm_18yr'],
                subdict['talksaboutkillingself_18yr'],
            )
    return "No Records Found"
find_item('1',data)
('1', '2', '1')
find_item('2',data)
('2', '3', '2')
find_item('zyzzyx',data)
'No Records Found'
And, regarding your function, here's where I believe the problem lies:
if k == 'record_id' and v == subj:
index = j
j+=1
else:
j+=1
In the case of the provided list of 2 records, this means you're setting index==0 before you update j, so even if the record is found at i[1], you still return the values from i[0]

How to correctly parse HTML to Unicode strings with pandas?

I'm running a Python program which fetches a UTF-8-encoded web page, and I extract some text from HTML table using pandas(read_html) and write result to csv file
However, when I write this text to a file,all spaces in it gets written in an unexpected encoding (example \xd0\xb9\xd1\x82\xd0\xb8).
to solve the problem I added a line i = i.split(" ")
after, all spaces in csv file substitutes for characters, the example below:
['0', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', '', '', '3\n0', '', '', '', '', '', '', '', 'number', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'last name', '', 'number', 'plan', 'NaN\n1', '', '', '', '', '', '', '', '', '', 'NaN', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'NaN', '', '', 'not', 'NaN\n2', '', '', '', '', '53494580', '', '', '', '', '', '', '', '', '', '+', '(53)494580', '', '', '', '', '', '', '', '', 'NP_551', 'NaN\n3', '', '', '', '', '53494581', '', '', '', '', '', '', '', '', '', '+', '(53)494581', '', '', '', '', '', '', '', '', 'NP_551', 'NaN\n4', '', '', '', '']
I would like to get rid of the empty-string entries ( '', ). Is there a way to fix this?
Any pointers would be much appreciated.
code python:
import csv

import html5lib
import pandas as pd
import requests

filename = "1.csv"
r = requests.get('http://10.45.87.12/og?sh=1&CallerName=&Sys=.79.83.86.51&')
pd.set_option('max_rows', 10000)
# read_html returns a list of DataFrames, one per table found in the page.
df = pd.read_html(r.content)
# The with-block closes the output file even if writing a row fails.
with open(filename, "w", encoding='UTF-8', newline='\n') as file:
    output = csv.writer(file, dialect='excel', delimiter=' ')
    for i in df:
        i = str(i)
        i = i.strip()
        i = i.encode('UTF-8').decode('UTF-8')
        i = i.split(" ")
        output.writerow(i)
You can use the built-in filter function to remove the empty values. You can add the snippet below after 'i = i.split(" ")':
A = ['0', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1', '', '', '', '', '', '', '', '', '', '', '', '', '', '2', '', '', '3\n0', '', '', '', '', '', '', '', 'number', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'last name', '', 'number', 'plan', 'NaN\n1', '', '', '', '', '', '', '', '', '', 'NaN', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'NaN', '', '', 'not', 'NaN\n2', '', '', '', '', '53494580', '', '', '', '', '', '', '', '', '', '+', '(53)494580', '', '', '', '', '', '', '', '', 'NP_551', 'NaN\n3', '', '', '', '', '53494581', '', '', '', '', '', '', '', '', '', '+', '(53)494581', '', '', '', '', '', '', '', '', 'NP_551', 'NaN\n4', '', '', '', '']
# filter(None, ...) drops falsy items such as ''; list() materialises the
# iterator that filter returns in Python 3 so the values are printed.
print(list(filter(None, A)))
Output:
['0', '1', '2', '3\n0', 'number', 'last name', 'number', 'plan', 'NaN\n1', 'NaN', 'NaN', 'not', 'NaN\n2', '53494580', '+', '(53)494580', 'NP_551', 'NaN\n3', '53494581', '+', '(53)494581', 'NP_551', 'NaN\n4']

Importing CSV from URL and displaying rows on Python by using Requests

import csv
import requests
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv')
# The Response object itself is handed to csv.reader here; iterating the
# reader below is where the "should return strings, not bytes" error is raised.
reader=csv.reader(webpage)
for row in reader:
print(row)
Hi, I'm new to Python and I'm trying to open a CSV file from a URL and then display the rows so I can take the data that I need from it. However, I get an error saying:
Traceback (most recent call last):
File "", line 1, in
for row in reader: Error: iterator should return strings, not bytes (did you open the file in text mode?)
Thank you in advance.
You can try this:
import csv
import requests

webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv')
# Response.content is bytes, and csv.reader in Python 3 requires str lines,
# so use the decoded Response.text before splitting into lines.
reader = csv.reader(webpage.text.splitlines())
for row in reader:
    print(row)
Hope this will help
Use .text as you are getting bytes returned in python3:
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv')
# NOTE: the whole decoded body is passed as a single "line", embedded newlines
# included; this is the step that fails with the _csv.Error described next.
reader = csv.reader([webpage.text])
for row in reader:
print(row)
That gives _csv.Error: new-line character seen in unquoted field, so split the lines after decoding. Also, stream=True lets you get the data in chunks rather than all at once, so you can filter by row and write:
import csv
import requests
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv', stream=1)
# NOTE(review): iterating the Response directly appears to yield raw byte
# chunks rather than complete lines; the sample output has fields split
# mid-word ('Tot' / 'alLMP'). Confirm against the Requests documentation.
for line in webpage:
# Each chunk is decoded, split into lines and parsed as CSV; only the first
# parsed row of the chunk is printed.
print(list(csv.reader((line.decode("utf-8")).splitlines()))[0])
Which gives you:
['Day Ahead Hourly LMP Values for 20160427', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['00', '600', '700', '800', '900', '1000', '1100', '1200', '1300', '1400', '1500', '1600', '1700', '1800', '1900', '2000', '2100', '2200', '2300', '2400', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['1', '25.13', '25.03', '28.66', '25.94', '21.74', '19.47', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['600', '600', '600', '700', '700', '700', '800', '800', '800', '900', '900', '900', '1000', '1000', '1000', '1100', '1100', '1100', '1200', '1200', '1200', '1300', '1300', '1300', '1400', '1400', '1400', '1500', '']
['1500', '1500', '1600', '1600', '1600', '1700', '1700', '1700', '1800', '1800', '1800', '1900', '1900', '1900']
['', '2000', '2000', '2000', '2100', '2100', '2100', '2200', '2200', '2200', '2300', '2300', '2300', '2400', '2400', '2400', '']
['lLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'Tot']
['alLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'To']
['talLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'TotalLMP', 'CongestionPrice', 'MarginalLossPrice', 'T']
.......................................
A variation on the answer by Padriac Cunningham uses iter_lines() from Requests and decodes each line using a list comprehension
import csv
import requests
webpage = requests.get('http://www.pjm.com/pub/account/lmpda/20160427-da.csv', stream = True)
# Decode every line yielded by iter_lines() from UTF-8 so that csv.reader
# receives str values rather than bytes.
webpage_decoded = [line.decode('utf-8') for line in webpage.iter_lines()]
reader = csv.reader(webpage_decoded)
or even simpler, you can have iter_lines() do the decoding
# iter_lines(decode_unicode=True) only decodes when the response has an
# encoding; fall back to UTF-8 when the server did not declare one, otherwise
# the lines would still be bytes.
if webpage.encoding is None:
    webpage.encoding = 'utf-8'
webpage_decoded = webpage.iter_lines(decode_unicode=True)

Categories

Resources