I am not familiar with Elasticsearch but need to run a query against it.
I got the basics going and queried one record using a URL of the form https://myelasticdb/_search?q=Uri%3a1067344, which returned the result below:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 13,
"successful": 13,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.0,
"hits": [
{
"_index": "cm_pp",
"_type": "_doc",
"_id": "1067344",
"_score": 1.0,
"_source": {
"AllContacts": [
30237
],
"Container": 1054260,
"Creator": 616,
"DateCreated": "2024-08-26 23:51:42",
"DateModified": "2024-08-26 23:51:43",
"DateRegistered": "2024-08-26 23:55:59",
"Document": {
"Filename": "0O6K0L640BL.PDF",
"Size": 97214,
"Status": {
"Indexed": false,
"Description": "Extraction Failed",
"Reason": "Failed"
},
"StoreID": "001+048X+0O6K0L640BL.PDF"
},
"Extension": "PDF",
"JoinField": "metadata",
"Number": "AR24/48700",
"ObjectType": "Record",
"OwnerLocation": 625,
"RecordType": 2,
"Title": "Test document",
"Uri": 1067344
}
}
]
}
}
Mapping
{
"cm_pp": {
"mappings": {
"dynamic": "false",
"properties": {
"AllContacts": {
"type": "long"
},
"Classification": {
"type": "long"
},
"Container": {
"type": "long"
},
"Creator": {
"type": "long"
},
"DateCreated": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
},
"DateModified": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
},
"DateRegistered": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
},
"Document": {
"properties": {
"ChildLevel": {
"type": "long"
},
"Content": {
"type": "text",
"fields": {
"phrase": {
"type": "text",
"analyzer": "std_analyzer"
}
},
"analyzer": "stem_analyzer"
},
"Filename": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 1024
}
},
"analyzer": "std_analyzer"
},
"Index": {
"type": "keyword"
},
"ParentFiles": {
"type": "keyword",
"ignore_above": 1024
},
"Size": {
"type": "long"
},
"Status": {
"properties": {
"Description": {
"type": "keyword"
},
"Indexed": {
"type": "boolean"
},
"Reason": {
"type": "keyword"
}
}
},
"StoreID": {
"type": "keyword"
}
}
},
"DocumentParts": {
"properties": {
"Count": {
"type": "long"
},
"Files": {
"type": "keyword",
"ignore_above": 1024
},
"TotalSize": {
"type": "long"
}
}
},
"Extension": {
"type": "text",
"fields": {
"phrase": {
"type": "text",
"analyzer": "std_analyzer"
}
},
"analyzer": "stem_analyzer"
},
"ExternalReference": {
"type": "keyword"
},
"JoinField": {
"type": "join",
"eager_global_ordinals": true,
"relations": {
"metadata": "content"
}
},
"Notes": {
"type": "text",
"fields": {
"phrase": {
"type": "text",
"analyzer": "std_analyzer"
}
},
"analyzer": "stem_analyzer"
},
"Number": {
"type": "keyword"
},
"ObjectType": {
"type": "keyword"
},
"OwnerLocation": {
"type": "long"
},
"RecordType": {
"type": "long"
},
"Title": {
"type": "text",
"fields": {
"phrase": {
"type": "text",
"analyzer": "std_analyzer"
}
},
"analyzer": "stem_analyzer"
},
"Uri": {
"type": "long"
}
}
}
} }
I now want to query on the part of the record shown below to find all the documents that failed to index. They are PDFs that weren't OCR'd.
"Status": {"Indexed": false, "Description": "Extraction Failed", "Reason": "Failed" },
I have tried a few things, but my lack of ElasticSearch knowledge means I am hitting a brick wall and getting no results. I am using curl from a Windows box.
Attempted queries that return no results:
{"query": {"term": {"Reason": "Failed"}}}
{"query": {"term": {"Document.Status.Reason": "Failed"}}}
{"query": {"term": {"Document.Status.Indexed": false}}}
Result: {"took":6,"timed_out":false,"_shards":{"total":3,"successful":3,"skipped":0,"failed":0},"hits":{"total":{"value":0,"relation":"eq"},"max_score":null,"hits":[]}}
Is anyone able to assist?
thanks jc
EDIT:
I got results with this:
{ "query" : { "match" : { "Document.Status.Indexed": false } } }
I am now trying to also filter by the Extension field, but I am getting no results:
{ \"query\" : {
\"bool\": {
\"must\": [
{
\"match\" : {
\"Document.Status.Indexed\": false } },
{
\"match\" : {
\"Document.Extension\": \"PDF\" } }
]
}}}