Initialiser les données de test:
CREATE EXTENSION IF NOT EXISTS pgcrypto;
CREATE TABLE docs (data JSONB NOT NULL DEFAULT '{}');
-- generate 200k documents, ~half with type: "type1" and another half with type: "type2", unique incremented index and random uuid per each row
INSERT INTO docs (data)
SELECT json_build_object('id', gen_random_uuid(), 'type', (CASE WHEN random() > 0.5 THEN 'type1' ELSE 'type2' END) ,'index', n)::JSONB
FROM generate_series(1, 200000) n;
-- inset one more row with explicit uuid to query by it later
INSERT INTO docs (data) VALUES (json_build_object('id', '30e84646-c5c5-492d-b7f7-c884d77d1e0a', 'type', 'type1' ,'index', 200001)::JSONB);
Première requête - filtrer par données-> type et limite:
-- FAST ~19ms
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
LIMIT 25;
/* "Limit (cost=0.00..697.12 rows=25 width=90) (actual time=0.029..0.070 rows=25 loops=1)"
" -> Seq Scan on docs (cost=0.00..5577.00 rows=200 width=90) (actual time=0.028..0.061 rows=25 loops=1)"
" Filter: (data @> '{"type": "type1"}'::jsonb)"
" Rows Removed by Filter: 17"
"Planning time: 0.069 ms"
"Execution time: 0.098 ms"
*/
Deuxième requête - filtrer par données-> type, classer par données-> index et limite
-- SLOW ~250ms
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
ORDER BY data->'index' -- added ORDER BY
LIMIT 25;
/* "Limit (cost=5583.14..5583.21 rows=25 width=90) (actual time=236.750..236.754 rows=25 loops=1)"
" -> Sort (cost=5583.14..5583.64 rows=200 width=90) (actual time=236.750..236.750 rows=25 loops=1)"
" Sort Key: ((data -> 'index'::text))"
" Sort Method: top-N heapsort Memory: 28kB"
" -> Seq Scan on docs (cost=0.00..5577.50 rows=200 width=90) (actual time=0.020..170.797 rows=100158 loops=1)"
" Filter: (data @> '{"type": "type1"}'::jsonb)"
" Rows Removed by Filter: 99842"
"Planning time: 0.075 ms"
"Execution time: 236.785 ms"
*/
Troisième requête - identique à la deuxième (précédente) mais avec un index btree sur data-> index:
CREATE INDEX docs_data_index_idx ON docs ((data->'index'));
-- FAST ~19ms
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
ORDER BY data->'index' -- added BTREE index on this field
LIMIT 25;
/* "Limit (cost=0.42..2473.98 rows=25 width=90) (actual time=0.040..0.125 rows=25 loops=1)"
" -> Index Scan using docs_data_index_idx on docs (cost=0.42..19788.92 rows=200 width=90) (actual time=0.038..0.119 rows=25 loops=1)"
" Filter: (data @> '{"type": "type1"}'::jsonb)"
" Rows Removed by Filter: 17"
"Planning time: 0.127 ms"
"Execution time: 0.159 ms"
*/
Quatrième requête - filtrer maintenant par data-> id et limit = 1:
-- SLOW ~116ms
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> ('{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}')::JSONB -- querying by "id" field now
LIMIT 1;
/* "Limit (cost=0.00..27.89 rows=1 width=90) (actual time=97.990..97.990 rows=1 loops=1)"
" -> Seq Scan on docs (cost=0.00..5577.00 rows=200 width=90) (actual time=97.989..97.989 rows=1 loops=1)"
" Filter: (data @> '{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}'::jsonb)"
" Rows Removed by Filter: 189999"
"Planning time: 0.064 ms"
"Execution time: 98.012 ms"
*/
Cinquième requête - identique à la quatrième mais avec un index gin (json_path_ops) sur les données:
CREATE INDEX docs_data_idx ON docs USING GIN (data jsonb_path_ops);
-- FAST ~17ms
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}'::JSONB -- added gin index with json_path_ops
LIMIT 1;
/* "Limit (cost=17.55..20.71 rows=1 width=90) (actual time=0.027..0.027 rows=1 loops=1)"
" -> Bitmap Heap Scan on docs (cost=17.55..649.91 rows=200 width=90) (actual time=0.026..0.026 rows=1 loops=1)"
" Recheck Cond: (data @> '{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}'::jsonb)"
" Heap Blocks: exact=1"
" -> Bitmap Index Scan on docs_data_idx (cost=0.00..17.50 rows=200 width=0) (actual time=0.016..0.016 rows=1 loops=1)"
" Index Cond: (data @> '{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}'::jsonb)"
"Planning time: 0.095 ms"
"Execution time: 0.055 ms"
*/
Sixième (et dernière) requête - identique à la troisième requête (requête par données-> type, ordre par données-> index, limite):
-- SLOW AGAIN! ~224ms
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
ORDER BY data->'index'
LIMIT 25;
/* "Limit (cost=656.06..656.12 rows=25 width=90) (actual time=215.927..215.932 rows=25 loops=1)"
" -> Sort (cost=656.06..656.56 rows=200 width=90) (actual time=215.925..215.925 rows=25 loops=1)"
" Sort Key: ((data -> 'index'::text))"
" Sort Method: top-N heapsort Memory: 28kB"
" -> Bitmap Heap Scan on docs (cost=17.55..650.41 rows=200 width=90) (actual time=33.134..152.618 rows=100158 loops=1)"
" Recheck Cond: (data @> '{"type": "type1"}'::jsonb)"
" Heap Blocks: exact=3077"
" -> Bitmap Index Scan on docs_data_idx (cost=0.00..17.50 rows=200 width=0) (actual time=32.468..32.468 rows=100158 loops=1)"
" Index Cond: (data @> '{"type": "type1"}'::jsonb)"
"Planning time: 0.157 ms"
"Execution time: 215.992 ms"
*/
Il semble donc que la sixième requête (identique à la troisième) soit beaucoup plus lente lorsqu'il y a un index gin sur la colonne de données. C'est probablement parce qu'il n'y a pas beaucoup de valeurs distinctes pour le champ data-> type (seulement "type1" ou "type2")? Qu'est-ce que je peux y faire? J'ai besoin de gin index pour faire d'autres requêtes qui en profitent ...