Fetch all data from mongodb - python

I'm trying to fetch all the data I have in my MongoDB collection and for some reason I can't do it.
I can get more than one result, but if I try to get more than a certain number of results, it stops working.
I'm using Flask, MongoDB (via pymongo) and React.
This is my Flask function.
@app.route("/escoller-centro-proba", methods=["POST"])
@cross_origin()
def search_proba():
    if request.method == "POST":
        centros = []
        resultadosPing = []
        resultadosNmap = []
        codigo = request.json['codigo'].upper()
        query = {"centro": {"$regex": codigo}}
        resultados = collection.find(query)
        for resultado in resultados:
            centroId = str(resultado["_id"])
            centros.append({"_id": centroId, "sf": resultado["sf"], "centro": resultado["centro"], "concello": resultado["concello"], "lan": resultado["lan"], "dhcp": resultado["dhcp"],
                            "tecnoloxia": resultado["tecnoloxia"], "tecnoloxia_respaldo": resultado["tecnoloxia_respaldo"], "eva": resultado["eva"]})
        if len(centros) > 1:
            return jsonify({"centros": centros, "resultadosPing": resultadosPing, "resultadosNmap": resultadosNmap})
        return jsonify({"centro": centros[0], "resultadosPing": resultadosPing, "resultadosNmap": resultadosNmap})
    else:
        return "Método non POST"
And here is the JS function.
const escollerCentro = async (e) => {
    e.preventDefault()
    const res = await instance.post("http://127.0.0.1:5000/escoller-centro-proba", {
        codigo: codigo.trim().toUpperCase()
    })
    console.log(res.data)
    if (res.data.centro === "O centro non existe") {
        setError("O centro non existe")
        setIsError(true)
        return;
    }
    if (res.data.centros) {
        tabsInfoVar[value].cras = res.data.centros
        tabsInfoVar[value].centro = {
            centro: "",
            resultadosPing: [],
            resultadosNmap: []
        }
    }
    if (res.data.centro) {
        tabsInfoVar[value].centro = {
            img: img,
            centro: res.data.centro.centro,
            index: res.data.centro._id,
            concello: res.data.centro.concello,
            lan: res.data.centro.lan,
            dhcp: res.data.centro.dhcp ? "Si" : "Non",
            sf: res.data.centro.sf,
            tecnoloxia: res.data.centro.tecnoloxia,
            tecnoloxia_respaldo: res.data.centro.tecnoloxia_respaldo,
            eva: res.data.centro.eva,
            resultadosPing: [],
            resultadosNmap: []
        }
        tabsInfoVar[value].cras = []
    }
    tabsInfoVar[value].resultadosPing = res.data.resultadosPing
    tabsInfoVar[value].resultadosNmap = res.data.resultadosNmap
    const resultadosPing = []
    for (var i = 0; i < tabsInfoVar[value].resultadosPing.length; i++) {
        if (tabsInfoVar[value].resultadosPing[i] !== null) {
            resultadosPing.push(tabsInfoVar[value].resultadosPing[i])
        } else {
            console.log("Resultado con valor nulo")
        }
    }
    const resultadosNmap = []
    for (var i = 0; i < tabsInfoVar[value]?.resultadosNmap.length; i++) {
        if (tabsInfoVar[value].resultadosNmap[i] !== null) {
            resultadosNmap.push(tabsInfoVar[value].resultadosNmap[i])
        } else {
            console.log("Resultado con valor nulo")
        }
    }
    tabsInfoVar[value].resultadosPing = resultadosPing;
    tabsInfoVar[value].resultadosNmap = resultadosNmap;
    setTabsInfo([...tabsInfoVar])
}
As I said, if I fetch fewer than 13 results, the code works: I get an array from the database and my frontend can work with it. Here is an example: I searched "RIANXO" and it shows me all the results that contain "Rianxo".
Here I'm searching "CRA", which should return an array of 168 results. Instead, I get this:
It shows all the results, but not as an array, and as you can see there is a "Show more" label in the console that I have to press if I want to see all the data.
I think it is a problem with MongoDB, because I did exactly the same thing working with Excel instead of MongoDB and had no problem fetching all the data (1,275 results).
Thank you all.

I have found the problem. When I saved all the data from Excel to MongoDB, the empty cells were filled with NaN values, and when the frontend tried to read those attributes it displayed the data in the way I have already shown.
The fix was to replace the value of those cells with df = df.fillna("").
Here is the code:
# Replace NaN from empty Excel cells before reading any rows, otherwise the
# comparison below never matches and NaN ends up stored in MongoDB.
df = df.fillna("")
for index, centro in enumerate(centros):
    data = df[df["Centro"].str.contains(centro)]
    id = df.iloc[index]["ID"]
    centro = df.iloc[index]["Centro"]
    concello = df.iloc[index]["Concello"]
    lan = df.iloc[index]["LAN"]
    dhcp = df.iloc[index]["DHCP"]
    tecnoloxia = df.iloc[index]["Tecnoloxía Acceso Principal"]
    tecnoloxia_respaldo = df.iloc[index]["Tecnoloxía Acceso Respaldo"]
    eva = "Non"
    if dhcp == "Si":
        dhcp = True
    else:
        dhcp = False
    lan = format_lan(lan)
    if tecnoloxia_respaldo == "":
        tecnoloxia_respaldo = "Non ten liña de backup"
    centroNovo = {"sf": id, "centro": centro, "concello": concello, "lan": lan, "dhcp": dhcp,
                  "tecnoloxia": tecnoloxia, "tecnoloxia_respaldo": tecnoloxia_respaldo, "eva": eva}
    print(centroNovo)
    collection.insert_one(centroNovo)
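If the NaN values are already stored in MongoDB, an alternative (a minimal sketch, not part of the original answer; clean_value and clean_document are made-up helper names) is to sanitize each document in the Flask endpoint before calling jsonify, since NaN is not valid JSON and corrupts the response the frontend receives:

import math

def clean_value(value):
    # Replace float NaN (what pandas writes for empty Excel cells) with an empty string.
    if isinstance(value, float) and math.isnan(value):
        return ""
    return value

def clean_document(doc):
    # Apply the replacement to every field of a document read from MongoDB.
    return {key: clean_value(value) for key, value in doc.items()}

Inside search_proba(), each dict appended to centros could then be passed through clean_document before being returned.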

Related

Python GRPC 13 Internal Error when trying to yield response

When I print the response, everything seems to be correct, and the type is also correct.
Assertion: True
Response type: <class 'scrape_pb2.ScrapeResponse'>
But on Postman I get "13 INTERNAL" with no additional information:
I can't figure out what the issue is, and I can't find out how to log or print the error from the server side.
Relevant proto parts:
syntax = "proto3";
service ScrapeService {
rpc ScrapeSearch(ScrapeRequest) returns (stream ScrapeResponse) {};
}
message ScrapeRequest {
string url = 1;
string keyword = 2;
}
message ScrapeResponse {
oneof result {
ScrapeSearchProgress search_progress = 1;
ScrapeProductsProgress products_progress = 2;
FoundProducts found_products = 3;
}
}
message ScrapeSearchProgress {
int32 page = 1;
int32 total_products = 2;
repeated string product_links = 3;
}
scraper.py
def get_all_search_products(search_url: str, class_keyword: str):
    search_driver = webdriver.Firefox(options=options, service=service)
    search_driver.maximize_window()
    search_driver.get(search_url)

    # scrape first page
    product_links = scrape_search(driver=search_driver, class_keyword=class_keyword)
    page = 1

    search_progress = ScrapeSearchProgress(page=page, total_products=len(product_links), product_links=[])
    search_progress.product_links[:] = product_links

    # scrape next pages
    while go_to_next_page(search_driver):
        page += 1
        print(f'Scraping page=>{page}')
        product_links.extend(scrape_search(driver=search_driver, class_keyword=class_keyword))
        print(f'Number of products scraped=>{len(product_links)}')
        search_progress.product_links.extend(product_links)

        # TODO: remove this line
        if page == 6:
            break

    search_progress_response = ScrapeResponse(search_progress=search_progress)
    yield search_progress_response
Server:
class ScrapeService(ScrapeService):
    def ScrapeSearch(self, request, context):
        print(f"Request received: {request}")
        scrape_responses = get_all_search_products(search_url=request.url, class_keyword=request.keyword)

        for response in scrape_responses:
            print(f"Assertion: {response.HasField('search_progress')}")
            print(f"Response type: {type(response)}")
            yield response
It turns out it was just an issue with Postman. I set up a Python client and it worked.
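For reference, a minimal Python client for the streaming call could look like the sketch below (the localhost:50051 address and the scrape_pb2_grpc module name are assumptions based on the usual grpcio-tools output; they are not shown in the original post):

import grpc

import scrape_pb2
import scrape_pb2_grpc  # assumed name of the generated stub module

def run():
    # Plaintext channel to a locally running server; adjust host/port as needed.
    with grpc.insecure_channel("localhost:50051") as channel:
        stub = scrape_pb2_grpc.ScrapeServiceStub(channel)
        request = scrape_pb2.ScrapeRequest(url="https://example.com/search", keyword="product-card")
        # ScrapeSearch is a server-streaming RPC, so the call returns an iterator of responses.
        for response in stub.ScrapeSearch(request):
            if response.HasField("search_progress"):
                progress = response.search_progress
                print(f"page={progress.page} total_products={progress.total_products}")

if __name__ == "__main__":
    run()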

How to get progress of file upload using requests.post() if the file was just uploaded via form-data? (Not sure whether this is a streaming process)

Suppose I'm running some kind of web service with Python & Flask aiming to provide file uploads. It should be noted that my service is only a kind of transfer station: I'll "repost" the file elsewhere (say, to a massive centralized file storage) as soon as I receive a file from the frontend form-data. The Flask code looks like this:
@admin.route('/data', methods=['POST'])
def data_upload():
    if 'file' in request.files:
        f = request.files['file']
        try:
            r = requests.post(DataConfig.server + '/' + '/upload.php', files={'file': (f.filename, f)})
            return {'e': 0, 'msg': r.content.decode('utf-8')}
        except RequestException:
            return ReturnCode.ERR_NETWORK_FAILURE
    return ReturnCode.ERR_BAD_DATA
The frontend is not allowed to send the file or data directly to "upload.php", since the server key should not be exposed to users.
Now I've got two problems that confuse me.
Is this process streaming or streaming-ready? I mean, will my Python server receive the whole file and store it somewhere temporarily, or will the file be reposted in chunks, like iter_block? If streaming is not readily enabled, how can I enable it to give a good large-file upload experience?
Is it possible to provide the user (frontend, browser) with information about the progress of the upload?
For question 2, some posts indicate that tqdm will help, but tqdm is a CLI tool, printing anything in the Python server's terminal makes no sense here, and the tqdm documentation doesn't show any obvious API for getting a percentage. What's more, sending this percentage to the frontend probably requires some XHR techniques, which I believe my current code can't handle.
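Before the posted answer (which solves this with client-side chunking in JavaScript plus a PHP receiver), it is worth noting the Python-side option the question asks about. A sketch, assuming the third-party requests-toolbelt package: its MultipartEncoder streams the multipart body instead of buffering it, and MultipartEncoderMonitor reports how many bytes have been sent, which is the number you would push to the browser (e.g. via SSE or WebSocket):

import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder, MultipartEncoderMonitor

def repost_with_progress(upstream_url, filename, fileobj, content_type="application/octet-stream"):
    encoder = MultipartEncoder(fields={"file": (filename, fileobj, content_type)})
    total = encoder.len

    def on_progress(monitor):
        # Called as chunks are read from fileobj; bytes_read / total is the fraction sent upstream.
        percent = monitor.bytes_read / total * 100
        print(f"uploaded {percent:.1f}%")  # in practice, publish this instead of printing it

    monitor = MultipartEncoderMonitor(encoder, on_progress)
    # Passing the monitor as `data` streams the body chunk by chunk instead of
    # building the whole multipart payload in memory first.
    return requests.post(upstream_url, data=monitor, headers={"Content-Type": monitor.content_type})

The browser can track its own leg of the transfer with XMLHttpRequest's upload.onprogress event.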
PART 1 OF MY ANSWER : THE JAVASCRIPT
async function ubox_send_chucks (divid) {
    var chunstart = 0;
    var chunend = 0;
    var chunksize = 1024 * 1024 * 9;
    var newblobchun_id = parseInt(Math.floor(Math.random() * 100000000) + 1);
    fi = $("#"+divid).files;
    stubuploads = fi.length;
    for (var i = 0; i < fi.length; i++) {
        var thismediaby_id = (newblobchun_id + i + 3);
        $("#progressx").append("<div id=\'loaderz_"+thismediaby_id+"_message\' class=\'padding3 margin3\' >FILE "+i+" (PREP)</div>");
        $("#phpx").append("<div id=\'phpz_"+thismediaby_id+"_message\' class=\'padding3 margin3\' >FILE "+i+" (PREP)</div>");
    }
    for (var i = 0; i < fi.length; i++) {
        if ( fi[i].size > 0 ) {
            var numberofchunks = Math.ceil( fi[i].size / chunksize );
            var thismediaby_id = (newblobchun_id + i + 3);
            logx("FILE "+i+" -- size: "+fi[i].size+" name: "+fi[i].name+" n of chunks to send: "+numberofchunks );
            // SEND EACH CHUNK
            for (var c = 0; c <= numberofchunks; c++) {
                chunstart = (c * chunksize);
                chunend = chunstart + chunksize + 1;
                if ( chunend > fi[i].size ) { chunend = fi[i].size; }
                var thischunk = fi[i].slice(chunstart, chunend);
                var thismediaby_name = thismediaby_id+"--chunk_"+c;
                console.log("FILE "+i+" send chunk: "+c+" start: "+chunstart+" end: "+chunend);
                upload_percent = ( c / numberofchunks ) * 100;
                $("#loaderz_"+thismediaby_id+"_message").html("FILE "+i+" : " + Math.round(upload_percent) + " %");
                var fd = new FormData();
                fd.append("data", thischunk, encodeURIComponent(thismediaby_name));
                fd.append("thismedia_id", encodeURIComponent(thismediaby_id));
                fd.append("thismedia_name", encodeURIComponent(fi[i].name));
                fd.append("numberofchunks", encodeURIComponent(numberofchunks));
                fd.append("thischunk_number", encodeURIComponent(c));
                fd.append("thisfilex", encodeURIComponent(i));
                fd.append("thissession", encodeURIComponent(thissession));
                fd.append("howmanyfiles", encodeURIComponent(fi.length));
                var pcache = (Math.floor(Math.random() * 100000000) + 1);
                await fetch("/templates/tests_ubox_slice/slice_receiver.php?pcache="+pcache, { method: "POST", body: fd })
                    .then(function (response) { return response.text(); })
                    .then(function (html) { $("#php_message").html(html); })
            }
            // WHEN ALL CHUNKS ARE SENT, TRIGGER A RECOMBINE (SAFER)
            // AJAX FUNCTION HERE https://stubs.s3.filebase.com/media/stubs/202111100393/recookies.js
            var combinex = [];
            combinex["thismedia_id"] = encodeURIComponent(thismediaby_id);
            combinex["thissession"] = encodeURIComponent(thissession);
            combinex["thismedia_name"] = encodeURIComponent(fi[i].name);
            stubajax("combiner","/templates/tests_ubox_slice/slice_combiner.php?pcache="+pcache,combinex);
        }
    }
}
PART II PHP RECEIVER
function clean () { /* ...stuff to make user input more secure, like rawurldecode, html_entity_decode, stripslashes, etc... */ }

$pcache = clean( $_REQUEST['pcache'] ?? '' );
$thismedia_id = clean( $_REQUEST['thismedia_id'] ?? '' );
$thismedia_name = clean( $_REQUEST['thismedia_name'] ?? '' );
$thismedia_ext = pathinfo($exif_file)['extension'] ?? '';
$numberofchunks = clean( $_REQUEST['numberofchunks'] ?? '' );
$thischunk_number = clean( $_REQUEST['thischunk_number'] ?? '' );
$thisfilex = clean( $_REQUEST['thisfilex'] ?? '' );
$howmanyfiles = clean( $_REQUEST['howmanyfiles'] ?? '' );
$thissession = clean( $_REQUEST['thissession'] ?? '' );

if ( $pcache != '' ) {
    // DEV
    // var_dump(['thismedia_id'=>$thismedia_id,'thismedia_name'=>$thismedia_name,'numberofchunks'=>$numberofchunks,'thischunk_number'=>$thischunk_number,'thisfilex'=>$thisfilex]);

    // WHERE TO SAVE CHUNKS
    $received_chunk_dir = '/temporary_path_here_you_receiver_chunks/'.$thissession.'/'.$thismedia_id.'/';
    $received_chunk_path = $received_chunk_dir.$thismedia_id.'_chunk_'.$thischunk_number;

    // IF DIRECTORY NOT THERE, CREATE IT
    if ( !file_exists($received_chunk_dir) ) { shell_exec('mkdir -p "'.$received_chunk_dir.'"'); }

    // MOVE_UPLOADED_FILE
    foreach ($_FILES as $thisfilekey => $thisfilechun) {
        if ( isset($thisfilechun['name']) && isset($thisfilechun['tmp_name']) ) {
            if ( filesize($thisfilechun['tmp_name']) > 1 ) { move_uploaded_file($thisfilechun['tmp_name'],$received_chunk_path); }
        }
    }

    // ECHO PERCENT PROGRESSION FOR THAT FILE
    echo '<script>$("#phpz_'.$thismedia_id.'_message").html("FILE '.$thisfilex.' : received chunk number '.$thischunk_number.' of '.$numberofchunks.' chunks");</script>';
}
PART III PHP COMBINER
$pcache = clean( $_REQUEST['pcache'] ?? '' );
$thismedia_id = clean( $_REQUEST['thismedia_id'] ?? '' );
$thismedia_name = accentx(clean( $_REQUEST['thismedia_name'] ?? '' ),'unlock');
$thismedia_ext = exiftensionx(['exif_file'=>$thismedia_name]);
$numberofchunks = clean( $_REQUEST['numberofchunks'] ?? '' );
$thischunk_number = clean( $_REQUEST['thischunk_number'] ?? '' );
$thisfilex = clean( $_REQUEST['thisfilex'] ?? '' );
$howmanyfiles = clean( $_REQUEST['howmanyfiles'] ?? '' );
$thissession = clean( $_REQUEST['thissession'] ?? '' );

if ( $thissession != '' ) {
    // PATH
    $received_chunk_dir = '/temporary_path_here_you_receiver_chunks/'.$thissession;
    $received_final_path = $received_chunk_dir.'/'.$thismedia_id.'/'.$thismedia_id.'.'.$thismedia_ext;

    // GET SORTED LIST OF CHUNKS WITH sort -V because of chunk_1, chunk_2, ..., chunk_9, chunk_10, chunk_11
    // ==> we don't want chunk_1, chunk_10, chunk_2 but chunk_1, chunk_2, ..., chunk_10
    $all_chunks_raw = shell_exec('ls '.$received_chunk_dir.'/'.$thismedia_id.'/* | sort -V');
    if ( $all_chunks_raw != '' ) {
        // GET LS OF ALL CHUNKS
        $all_chunks_explo = array_filter(explode(PHP_EOL,$all_chunks_raw));

        // IF ONLY 1 CHUNK, JUST RENAME
        if ( count($all_chunks_explo) == 1 ) { rename($all_chunks_explo[0],$received_final_path); #unlink($all_chunks_explo[0]); }
        else {
            // RECOMBINE ALL CHUNKS WITH FOPEN/FREAD; chunksize = 1024 * 1024 * 9 = 9437184 from the javascript, HAS TO BE THE SAME VALUE
            foreach ( $all_chunks_explo as $chunkey => $chunx ) {
                $file = fopen($chunx, 'rb'); $buff = fread($file, 9437184); fclose($file);
                $final = fopen($received_final_path, 'ab'); $write = fwrite($final, $buff); fclose($final);
            }
            // DELETE CHUNKS AFTER COMBINE
            shell_exec('find '.$received_chunk_dir.'/'.$thismedia_id.' -name "*_chunk_*" -delete');
        }
    }

    // HERE YOU CAN USE FFMPEG, IMAGEMAGICK, ETC. TO CONVERT TO A WEB-COMPATIBLE FORMAT
    // HERE YOU CAN SEND FILES TO AN S3 BUCKET (services like filebase.com)
    // DELETE FILE AFTER SENDING TO S3 BUCKET IF YOU NEED TO CLEAR SPACE
    echo '<script>$("#phpz_'.$thismedia_id.'_message").append(" ---COMBINATION DONE---");</script>';
}
******** NOTE: because of async, it's important to WAIT for all chunks BEFORE combining. Due to network (internet) factors, chunks don't always arrive one after another; sometimes you get chunk 1, then 3, then 6, then 2. That's why an AJAX call sends a signal to the combiner to "tell" it that all chunks have been sent.
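Since the question itself is Flask-based, here is a rough Python analog of the receiver/combiner idea above (a sketch only, with assumed paths and the same form field names as the JavaScript; it is not part of the original answer):

import os
from flask import Flask, request

app = Flask(__name__)
CHUNK_DIR = "/tmp/upload_chunks"  # assumed temporary location

@app.route("/chunk", methods=["POST"])
def receive_chunk():
    # Mirrors the PHP receiver: store each chunk under <session>/<media_id>/chunk_<n>.
    session = request.form["thissession"]
    media_id = request.form["thismedia_id"]
    chunk_number = int(request.form["thischunk_number"])
    target_dir = os.path.join(CHUNK_DIR, session, media_id)
    os.makedirs(target_dir, exist_ok=True)
    # Zero-padding keeps a plain lexical sort in numeric order when combining.
    request.files["data"].save(os.path.join(target_dir, f"chunk_{chunk_number:06d}"))
    return {"received": chunk_number}

@app.route("/combine", methods=["POST"])
def combine_chunks():
    # Mirrors the PHP combiner: concatenate the chunks in order once the client
    # signals that every chunk has been sent.
    session = request.form["thissession"]
    media_id = request.form["thismedia_id"]
    filename = request.form["thismedia_name"]  # sanitize in production (e.g. secure_filename)
    target_dir = os.path.join(CHUNK_DIR, session, media_id)
    final_path = os.path.join(target_dir, filename)
    with open(final_path, "wb") as final:
        for chunk_name in sorted(os.listdir(target_dir)):
            if chunk_name.startswith("chunk_"):
                with open(os.path.join(target_dir, chunk_name), "rb") as chunk:
                    final.write(chunk.read())
    return {"combined": filename}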

When a dictionary is passed into a Jinja2 template, a single apostrophe (') is converted into &#39;

JavaScript is throwing the error 'Uncaught SyntaxError: Unexpected token '&''.
When I debugged in Views.py, I got the data with proper apostrophes.
def newEntry(request):
    assert isinstance(request, HttpRequest)
    i = 1
    for x in lines:
        for line in x:
            cursor.execute("select distinct regionn FROM [XYZ].[dbo].[Errors] where [Linne] like '%" + line + "%'")
            region[i] = cursor.fetchall()
            i = i + 1
    return render(
        request,
        'app/newEntry.html',
        {
            'title': 'New Entry',
            'year': datetime.now().year,
            'lines': lines,
            'regions': region,
        }
    )
And here is my JS code:
var Regions = {{regions}}

function changecat(value) {
    if (value.length == 0) document.getElementById("category").innerHTML = "<option>default option here</option>";
    else {
        var catOptions = "";
        for (categoryId in Regions[value]) {
            catOptions += "<option>" + categoryId + "</option>";
        }
        document.getElementById("category").innerHTML = catOptions;
    }
}
Thanks in advance. If this is not a best practice for carrying data, please suggest a better approach that fills my requirement.
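For context, this symptom usually comes from template auto-escaping: rendering the dict directly inside a script tag emits its Python repr, and the engine escapes every apostrophe to &#39;. A common fix (a sketch only, not an answer given in the thread; json_regions is a made-up context key) is to serialize the data to JSON in the view and emit that instead:

import json

def newEntry(request):
    # ... build `lines` and `region` exactly as before ...
    return render(
        request,
        'app/newEntry.html',
        {
            'title': 'New Entry',
            'lines': lines,
            # json.dumps produces double-quoted, JavaScript-parsable output instead of a Python repr
            'json_regions': json.dumps(region),
        }
    )

In the template the value can then be output unescaped for that one variable, e.g. var Regions = {{ json_regions|safe }}; (the safe filter exists in both Django templates and Jinja2).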

Communicate Node.js with Python and fetch data delay

I'm trying to program a web crawler.
I have server.js / crawling.js / dataCrawler.py.
When I call crawlData (defined in crawling.js) from server.js, the method that uses spawn to execute dataCrawler.py gets called.
I need the data in server.js, but executing dataCrawler.py takes a while, so instead of proper data I get null or undefined.
Do you have any solution? Or has anyone had the same issue?
My code is below. (It isn't complete; it's just a reference for the structure.)
//server.js
var crawler = require("./crawling")

var resultArr = crawler.crawlData();
console.log('nodeserver:', resultArr)

//crawling.js
exports.crawlData = () => {
    var dataArr = [];
    var temp;
    var py = spawn('python', ['dataCrawler.py']);
    var data = [totalUrl, gubun];
    var dataFromPy = null;

    py.stdout.on('data', function(result){
        var dataArr = encoding.convert(result, 'utf-8')
        dataArr = JSON.parse(encoding.convert(result, 'utf-8'));
        py.stdout.on('end', function(){
            temp = dataArr
        });
    });

    py.stdin.write(JSON.stringify(data));
    py.stdin.end();

    return temp;
}
//dataCrawler.py
def crawling(url, gubun, page_count):
    idx = 0
    result = []
    jsonData = {}
    for i in range(1, page_count + 1):
        ....
        crawling code
        ....
    return result

def main():
    lines = sys.stdin.readlines()
    paraFromServer = json.loads(lines[0])
    url = paraFromServer[0]
    gubun = paraFromServer[1]
    result = crawling(url, gubun, page_count)
    print(result)

main()
You didn't account for the asynchronous nature of JavaScript. What you have to do is pass a callback into the crawlData method; it will be invoked once scraping is done.
exports.crawlData = (cb) => {
    ....
    py.stdout.on('data', function(result){
        var dataArr = encoding.convert(result, 'utf-8')
        dataArr = JSON.parse(encoding.convert(result, 'utf-8'));
        py.stdout.on('end', function(){
            cb(dataArr); // ideally the pattern is cb(error, data)
        });
    });
    ...
}
So server.js becomes:
var crawler = require("./crawling")

crawler.crawlData((data) => {
    console.log(data);
    // Do whatever you want to do with the data.
});
Callbacks can lead to callback hell; try exploring promises or async/await.
Alternatively, you can use spawnSync if running in parallel isn't a concern:
exports.crawlData = () => {
    const result = spawnSync('python', ['dataCrawler.py'], {
        input: JSON.stringify([totalUrl, gubun])
    });
    // spawnSync returns an object; the script's output is on result.stdout
    return JSON.parse(encoding.convert(result.stdout, 'utf-8'));
}
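One more detail worth noting: JSON.parse on the Node side only works if the Python script prints valid JSON, and print(result) emits a Python repr with single quotes. A small adjustment to main() in dataCrawler.py (a sketch mirroring the question's code; crawling() and page_count come from the rest of that file) avoids this:

import json
import sys

def main():
    lines = sys.stdin.readlines()
    paraFromServer = json.loads(lines[0])
    url = paraFromServer[0]
    gubun = paraFromServer[1]
    result = crawling(url, gubun, page_count)
    # json.dumps guarantees double-quoted, JSON.parse-compatible output on stdout.
    print(json.dumps(result, ensure_ascii=False))

main()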

CSV to elasticsearch with python SerializationError

When I try to send the bulk_data to the local Elasticsearch, my data isn't loaded because of a SerializationError.
I already tried to fill the empty cells in the CSV file, but that wasn't the solution.
from elasticsearch import Elasticsearch

bulk_data = []
header = []
count = 0
for row in csv_file_object:
    if count > 0:
        data_dict = {}
        for i in range(len(row)):
            row = row.rstrip()
            data_dict[header[i]] = row[i]
        op_dict = {
            "index": {
                "_index": INDEX_NAME,
                "_type": TYPE_NAME,
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)
    else:
        header = row
    count = count + 1

# create ES client, create index
es = Elasticsearch(hosts=[ES_HOST])
if es.indices.exists(INDEX_NAME):
    print("deleting '%s' index..." % (INDEX_NAME))
    res = es.indices.delete(index=INDEX_NAME)

res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
See image for the SerializationError and bulk_data values:
Please note: the \n is added by the serialization process itself.
I'll try to respond, but there is one thing I can't understand: how do you retrieve your field names from the data? In your code I see that you retrieve them from a list called header that is empty, and I can't understand how you get that value. Check my answer; I don't know if I've understood you correctly.
from elasticsearch import Elasticsearch
from elasticsearch import helpers

index_name = "your_index_name"
doc_type = "your_doc_type"
esConnector = Elasticsearch(["http://192.168.1.1:9200/"])
# change your ip here

count = 0

def generate_data(csv_file_object):
    global count
    with open(csv_file_object, "r") as f:
        for line in f:
            line = line.rstrip().split(",")
            # header is assumed to be defined elsewhere (see the question)
            data_dict = {header[count]: line}
            obj = {
                '_op_type': 'index',
                '_index': index_name,
                '_type': doc_type,
                '_id': count + 1,
                '_source': data_dict
            }
            count += 1
            yield obj

for success, info in helpers.parallel_bulk(client=esConnector, actions=generate_data(csv_file_object), thread_count=4):
    if not success:
        print('Doc failed', info)
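As a side note, a cleaner way to get the field names (a sketch, assuming the first CSV row is the header; adjust the host, index name and file path) is to let csv.DictReader pair each value with its column name and feed the resulting documents to helpers.bulk:

import csv

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(["http://localhost:9200/"])
index_name = "your_index_name"

def generate_actions(csv_path):
    with open(csv_path, newline="") as f:
        # DictReader uses the first row as the header, so every document
        # is keyed by column name and empty cells become empty strings.
        for row_number, row in enumerate(csv.DictReader(f), start=1):
            yield {
                "_op_type": "index",
                "_index": index_name,
                "_id": row_number,
                "_source": row,
            }

helpers.bulk(es, generate_actions("your_file.csv"))

(_type is omitted here since recent Elasticsearch versions no longer use mapping types.)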
