Skip to content

Instantly share code, notes, and snippets.

@hi-ogawa
Last active February 21, 2022 04:26
Show Gist options
  • Save hi-ogawa/f2dd4ec647a5c8c755aca5dcb3a86a0c to your computer and use it in GitHub Desktop.
Save hi-ogawa/f2dd4ec647a5c8c755aca5dcb3a86a0c to your computer and use it in GitHub Desktop.
Reading PostgreSQL

todo / summary

specific examples

  • insert
  • insert on conflict
  • update
  • delete
  • select
    • scan
      • seqscan
      • indexscan
      • bitmapscan
    • join (@)
      • nested loop
    • group by
    • order by
      • "implicitly sorted" index scan (cf. tuplesort.sql)
    • limit, offset
    • distince, distinct on
  • index
    • btree
    • gin
      • array containment
      • array intersection
      • multiple columns
    • composite index
    • partial index
  • explain
  • migration query
  • vacuume

references

hacking

# out-of-tree build
mkdir -p Debug && cd Debug
CC=clang CFLAGS=-g ../configure  # it adds -O2 without explicit CFLAGS
make

# run test
make check  # run default test suite `src/test/regress/parallel_schedule`
make check-tests EXTRA_TESTS=gin  # run specific tests

# run postgres from "tmp_install"
make temp-install
rm -rf tmp && mkdir -p tmp
LD_LIBRARY_PATH="$PWD/tmp_install/usr/local/pgsql/lib" ./tmp_install/usr/local/pgsql/bin/initdb -D ./tmp/data --no-clean --no-sync
LD_LIBRARY_PATH="$PWD/tmp_install/usr/local/pgsql/lib" ./tmp_install/usr/local/pgsql/bin/postgres --single -D ./tmp/data postgres
# look for nice documentation
find . -type f -name README
./src/backend/utils/mmgr/README
./src/backend/utils/fmgr/README
./src/backend/nodes/README
./src/backend/executor/README
./src/backend/access/transam/README
./src/backend/access/gin/README
./src/backend/access/nbtree/README
./src/backend/optimizer/README
./src/backend/parser/README
  • .vscode/c_cpp_properties.json
    • Debug/src/include is needed for some auto generated headers
{
  "version": 4,
  "configurations": [
    {
      "name": "Linux",
      "includePath": [
        "${workspaceFolder}/src/include",
        "${workspaceFolder}/Debug/src/include"
      ],
      "defines": [],
      "compilerPath": "/usr/bin/clang",
      "cStandard": "c17",
      "cppStandard": "c++14",
      "intelliSenseMode": "linux-clang-x64"
    }
  ]
}
  • .vscode/launch.json
    • See vadimcn/codelldb#492 for "Operation not permitted" error
    • follow-fork-mode is not available for lldb (convenient for debugging into PostgresMain from PostmasterMain)
    • Probably vscode-lldb is not useful but just keep them here just in case
{
  "version": "0.2.0",
  "configurations": [
    {
      "name": "(gdb) attach postgres",
      "type": "cppdbg",
      "request": "attach",
      "program": "${workspaceFolder}/Debug/tmp_install/usr/local/pgsql/bin/postgres",
      "processId": "${command:pickProcess}",
      "externalConsole": false,
      "MIMode": "gdb",
      "miDebuggerPath": "/usr/bin/gdb",
      "setupCommands": [
        { "text": "set follow-fork-mode child" },
        { "text": "-enable-pretty-printing" }
      ]
    },
    {
      "name": "(lldb) attach postgres",
      "type": "lldb",
      "request": "attach",
      "program": "${workspaceFolder}/Debug/tmp_install/usr/local/pgsql/bin/postgres",
      "pid": "${command:pickMyProcess}"
    },
    {
      "name": "(lldb) launch pg_regress",
      "type": "lldb",
      "request": "launch",
      "cwd": "${workspaceFolder}/Debug/src/test/regress",
      "program": "${workspaceFolder}/Debug/src/test/regress/pg_regress",
      "args": [
        "--temp-instance=./tmp_check",
        "--inputdir=${workspaceFolder}/src/test/regress",
        "--bindir=",
        "--dlpath=.",
        "--max-concurrent-tests=20",
        "--schedule=${workspaceFolder}/src/test/regress/parallel_schedule"
      ],
      "env": {
        "PATH": "${workspaceFolder}/Debug/tmp_install/usr/local/pgsql/bin:${workspaceFolder}/Debug/src/test/regress:${env:PATH}",
        "LD_LIBRARY_PATH": "${workspaceFolder}/Debug/tmp_install/usr/local/pgsql/lib"
      }
    }
  ]
}
  • patch a few files to make it easier to attach a debugger to forked processes
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index fda2e9360e..37584239c2 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -956,6 +956,16 @@ pg_plan_queries(List *querytrees, const char *query_string, int cursorOptions,
 static void
 exec_simple_query(const char *query_string)
 {
+	// Allow attaching for specific SQL
+	if (getenv("DEV_Q") != NULL && strncmp(query_string, getenv("DEV_Q"), strlen(getenv("DEV_Q"))) == 0) {
+		printf("[backend(pid = %d)] Sleeping to give some time for debugger to attach", getpid());
+		for (int i = 0; i < 10; i++) {
+			fflush(stdout);
+			pg_usleep(1000000L);
+			printf(".");
+		}
+		puts("");
+	}
 	CommandDest dest = whereToSendOutput;
 	MemoryContext oldcontext;
 	List	   *parsetree_list;
diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c
index e6f71c7582..6243bf78f7 100644
--- a/src/test/regress/pg_regress.c
+++ b/src/test/regress/pg_regress.c
@@ -2457,6 +2457,16 @@ regression_main(int argc, char *argv[],
 #endif
 		printf(_("running on port %d with PID %lu\n"),
 			   port, ULONGPID(postmaster_pid));
+
+		if (getenv("DEV_M") != NULL) {
+			printf("Sleeping to give some time for debugger to attach to postmaster");
+			for (int i = 0; i < 10; i++) {
+				fflush(stdout);
+				pg_usleep(1000000L);
+				printf(".");
+			}
+			puts("");
+		}
 	}
 	else
 	{
  • debugging workflow
#
# Attach to a postmaster process
#
$ DEV_M=1 make check-tests EXTRA_TESTS=__dev__
...
============== starting postmaster                    ==============
running on port 51696 with PID 129962
Sleeping to give some time for debugger to attach to postmaster....
# **** At this moment, press F5 to debug into `postgres` process


#
# Attach to a backend process handling specific SQL
#
$ DEV_Q='CREATE TABLE dev_t' make check-tests EXTRA_TESTS="__dev__"
...
============== running regression test queries        ==============
test __dev__                      ... ok        59132 ms

# **** At this moment, look for the process id in Debug/src/test/regress/log/postmaster.log e.g.
# ****   [backend(pid = 202932)] Sleeping to give some time for debugger to attach..........
# **** Based on the log, attach to the backend process.
  • sample sql file to debug into
-- src/test/regress/sql/__dev__.sql

-- Run by `make check-tests EXTRA_TESTS="__dev__"`
-- Update snapshot by `cp -f src/test/regress/results/__dev__.out ../src/test/regress/expected/__dev__.out`

CREATE TABLE dev_t (col1 INTEGER);
INSERT INTO dev_t VALUES (123);
SELECT * FROM dev_t;

main loop

main => PostmasterMain => ServerLoop =>
  loop
    select
    if new connection
      ConnCreate
      BackendStartup =>
        fork_process
        (child) BackendRun => PostgresMain => ...


PostgresMain(dbname, ...) =>
  InitPostgres
  loop
    send ReadyForQuery message if send_ready_for_query
    initStringInfo(&input_message)
    ReadCommand(&input_message)
    # Switch by command (e.g. 'Q' for "simple query")
    # cf.
    # - https://www.postgresql.org/docs/14/protocol-flow.html
    # - https://www.postgresql.org/docs/14/protocol-message-formats.html
    (if 'Q')
      exec_simple_query => ...
      send_ready_for_query = true

SQL processing

exec_simple_query(query_string) =>
  start_xact_command => ???
  pg_parse_query => raw_parser (see scan.l and gram.y)
  foreach(parsetree_item, parsetree_list) (process each statement separated by semicolon)
    CreateCommandTag (see cmdtaglist.h (e.g. "CREATE DATABASE", "SELECT", ...))
    pg_analyze_and_rewrite (RawStmt/Node -> Query) => ...
    pg_plan_queries (Query -> PlannedStmt) => ...
    (execution routines)
      CreatePortal ... PortalRun
    send EndCommand message
  finish_xact_command => ???


pg_analyze_and_rewrite =>
  parse_analyze =>
    make_parsestate (allocate `ParseState`)
    transformTopLevelStmt => transformOptionalSelectInto =>
      transformStmt =>
        (e.g. transformSelectStmt, transformInsertStmt, transformDeleteStmt, ...)
  pg_rewrite_query


pg_plan_queries => ...


(execution)
  CreatePortal
  PortalDefineQuery (set PlannedStmt to Portal)
  PortalStart =>
    ChoosePortalStrategy (e.g. PORTAL_ONE_SELECT, PORTAL_MULTI_QUERY)
  CreateDestReceiver
  PortalRun =>
    switch by portal->strategy
    (if PORTAL_ONE_SELECT)
      PortalRunSelect => ...
    (if PORTAL_MULTI_QUERY)
      PortalRunMulti => ...
pg_analyze_and_rewrite => ... => transformStmt =>
  no transformationn (Query with CMD_UTILITY and utilityStmt as an original Node)

pg_plan_queries =>
  no plan (PlannedStmt with CMD_UTILITY and utilityStmt from Query)

PortalRun => PortalRunMulti => PortalRunUtility => ProcessUtility(PlanedStmt) =>
  standard_ProcessUtility =>
    # Switch by PlannedStmt.utilityStmt.type
    (if T_CreatedbStmt) createdb(ParseState, CreatedbStmt) =>
      # fetch template db metadata to copy from
      get_db_info(dbtemplate, ...) => ...

      # open `pg_database` to write a new entry
      table_open(DatabaseRelationId, ...)
      GetNewOidWithIndex (with pg_database_oid_index) => ???
      copy template db metadata to `Datum[Natts_pg_database]` (which corresponds to `Form_pg_database`)
      heap_form_tuple (Datum[] -> HeapTuple)
      CatalogTupleInsert => ...

      # finally setup directory structure based on pg_tablespace
      table_open(TableSpaceRelationId, ...)
      copydir ...

    (default)
      ProcessUtilitySlow =>
        (if T_CreateStmt)
          transformCreateStmt =>
            RangeVarGetAndCheckCreationNamespace
            for each element in stmt->tableElts (T_ColumnDef, T_Constraint, etc...)
              transformColumnDefinition
          DefineRelation =>
            BuildDescForRelation (List<ColumnDef> -> TupleDesc)
            set `default_table_access_method` as `accessMethod`
            heap_create_with_catalog => (TODO: main stuff)


# essentially `SELECT * FROM pg_database WHERE datname = 'template0'`
get_db_info =>
  # opening system catalog `pg_database`
  table_open(DatabaseRelationId, ...) => relation_open =>
    RelationIdGetRelation (OiD -> Relation) => ???

  # `F_NAMEEQ` is a builtin function OID in `pg_proc`
  ScanKeyInit(..., Anum_pg_database_datname, BTEqualStrategyNumber, F_NAMEEQ, ...) =>
    fmgr_info(Oid,  FmgrInfo*) => fmgr_info_cxt_security =>
      fmgr_isbuiltin (look up `fmgr_builtin_oid_index` and `fmgr_builtins` (auto-generated based on pg_proc.dat))

  # scan using index `DatabaseNameIndexId` (aka `pg_database_datname_index`)
  systable_beginscan =>
    index_open => relation_open => ...
    table_slot_create (TupleTableSlot from Relation's TupleDesc)
    index_beginscan => index_beginscan_internal =>
      # amapi `btbeginscan` for `pg_database_datname_index`
      IndexAmRoutine.ambeginscan

  # `HeapTuple` is the entry of `pg_database` (i.e. `Form_pg_database`)
  systable_getnext (SysScanDesc -> HeapTuple) =>
    index_getnext_slot =>
      index_getnext_tid =>
        IndexAmRoutine.amgettuple (e.g. btgettuple)
      index_fetch_heap =>
        table_index_fetch_tuple =>
          # fetch to TupleTableSlot
          TableAmRoutine.index_fetch_tuple (e.g. heapam_index_fetch_tuple)
    ExecFetchSlotHeapTuple (TupleTableSlot -> HeapTuple)

  # here actually refetch `Form_pg_database` based on `Form_pg_database.oid` obtained via index above
  SearchSysCache1(DATABASEOID, ...)


# utility for inserting to system catalog
CatalogTupleInsert => ???


--------------------
-- data structure --
--------------------


CreateStmt
  List<ColumnDef> tableElts

ObjectAddress
pg_analyze_and_rewrite => ... => transformStmt =>
  transformInsertStmt =>
    setTargetTable =>
      parserOpenTable => table_openrv_extended => relation_openrv_extended =>
        RangeVarGetRelid => RelnameGetRelid =>
          get_relname_relid => GetSysCacheOid2 (TODO: is this looking up pg_class when cache not hit?)
        relation_open (by OID) => ...
      addRangeTableEntryForRelation =>
        instantiate `RangeTblEntry` and push to `ParserState.pr_rtable`
        buildRelationAliases
        buildNSItemFromTupleDesc
    checkInsertTargets =>
      (if no explicit columns arguments for `INSERT`)
        initialize `List<ResTarget>` based on `Relation.rd_att (TupleDesc)`
      (otherwise validate arguments)
    (if `INSERT ... SELECT`)
      TODO
    (if `INSERT ... VALUES`)
      transformExpressionList(... EXPR_KIND_VALUES_SINGLE) =>
        transformExpr => transformExprRecurse => (e.g. make_const)
      transformInsertRow => transformAssignedExpr
    makeTargetEntry and append to `Query.targetList`


pg_plan_query => planner => standard_planner =>
  subquery_planner (Query -> PlannerInfo) =>
    (TODO)
    grouping_planner =>
      (TODO)
      if commandType != CMD_SELECT
        create_modifytable_path => ???
      add_path
    fetch_upper_rel => ???
    set_cheapest => ???
  fetch_upper_rel (PlannerInfo -> RelOptInfo) => ...
  get_cheapest_fractional_path (RelOptInfo -> Path) => ???
  create_plan ((PlannerInfo, Path) -> Plan) =>
    create_plan_recurse =>
      # switch by Path.pathtype (e.g. T_ModifyTable)
      create_modifytable_plan =>
        create_plan_recurse (for subpath)
        make_modifytable


PortalStart => ChoosePortalStrategy (PORTAL_MULTI_QUERY)

PortalRun => PortalRunMulti => ProcessQuery =>
  CreateQueryDesc
  ExecutorStart => standard_ExecutorStart =>
    CreateExecutorState
    InitPlan =>
      ExecInitNode(Plan, ...) =>
        # Switch by Plan.tag (e.g. T_ModifyTable)
        ExecInitModifyTable =>
          (TODO)
          ExecInitResultRelation => ???
          ExecInitNode (recursively for subplan)
  ExecutorRun => standard_ExecutorRun => ExecutePlan =>
    loop
      ExecProcNode => ExecProcNodeFirst => ExecModifyTable =>
        loop
          ExecProcNode (recursive call for subplan to obtain `TupleTableSlot` to insert e.g. via `ExecResult`)
          (break if subplan returns Null Tuple)
          # Switch by operation CMD_INSERT/UPDATE/DELETE
          (if CMD_INSERT)
            ExecInitInsertProjection
            ExecGetInsertNewTuple => ExecCopySlot ...
            ExecInsert(ModifyTableState, ...) =>
              table_tuple_insert => TableAmRoutine.tuple_insert of Relation (e.g. heapam_tuple_insert)
              ExecInsertIndexTuples(ResultRelInfo, ...) => ???
              ExecProcessReturning => ???
          (if RETURNING) return result slot (and `ExecModifyTable` will be called again by the loop in `ExecutePlan`)
          (otherwise) continue the loop
      DestReceiver.receiveSlot (for RETURNING) => ???
  SetQueryCompletion


ExecResult => ExecProject =>
  ExecEvalExprSwitchContext =>
     ExecInterpExprStillValid (as ExprState.evalfunc) => ExecInterpExpr =>
      # Switch by opcode (TODO: when did we compile to the interpreter code?)
      (if EEOP_CONST) copy ExprEvalStep.constval.value (= 123)
      (if EEOP_ASSIGN_TMP) set to ExprState.resultslot.tss_values


--------------------
-- data structure --
--------------------

-- (parse/analysis) --

InsertStmt
  RangeVar
    relname
  List<ResTarget> cols

ParseState
  Relation p_target_relation
  ParseNamespaceItem p_target_nsitem
  List<RangeTblEntry> p_rtable

Query
  int resultRelation
  List<TargetEntry>
  List<RangeTblEntry>

RangeTblEntry
  RTEKind (e.g. RTE_RELATION, RTE_JOIN)


-- (planning) --

PlannerGlobal

PlannerInfo
  Relids all_result_relids, leaf_result_relids

PlannedStmt
  planTree Plan
  List<RangeTblEntry>

Path
  NodeTag

ModifyTablePath < Path

Plan
  NodeTag

ModifyTable < Plan

PlanState
  ExecProcNode (function pointer)
  ExprContext
  ProjectionInfo
    ExprState
      ExprStateEvalFunc (e.g. ExecInterpExpr)
      ExprEvalStep
        opcode
        union (depends on opcode)
      TupleTableSlot resultslot

ModifyTableState < PlanState
  CmdType (INSERT/UPDATE/DLETE)
  ResultRelInfo
    TupleTableSlot ri_newTupleSlot
    int ri_NumIndices (the number of indeces to be updated for each tuple change)
    ProjectionInfo ri_projectReturning

ResultState < PlanState


-- (execution) --

Portal
  PortalStrategy

QueryDesc
  PlannedStmt

EState (aka execution state)
  CommandId es_output_cid (for transaction?)

TupleTableSlot
  Datum tts_values
  bool tts_isnull
  Oid tts_tableOid
transformSelectStmt(ParseState, SelectStmt) =>
  set CMD_SELECT as Query.commandType
  transformFromClause =>
    for each "fromItem"
      transformFromClauseItem =>
        (if RangeVar)
          transformTableEntry(ParseState, RangeVar) =>
            addRangeTableEntry =>
              (cf. addRangeTableEntryForRelation)
              parserOpenTable
              append RangeTblEntry to ParserState.p_rtable
          return RangeTblRef
        (if JoinExpr)
          TODO
          transformFromClauseItem (recursively for joining table)
          transformJoinOnClause => ???
      append to ParseState.p_joinlist and p_namespace
  transformTargetList =>
    (if "*") ExpandColumnRefStar
    (othewise) transformTargetEntry
  qual = transformWhereClause => transformExpr => ...
  Query.jointree from ParseState.p_joinlist and qual


standard_planner => ???


PortalStart =>
  ChoosePortalStrategy (PORTAL_ONE_SELECT)
  (if PORTAL_ONE_SELECT)
    CreateQueryDesc
    ExecutorStart => ... => ExecInitNode =>
      (if T_SeqScan)
        ExecInitSeqScan =>
          (TODO)
          initialize SeqScanState
          ExecOpenScanRelation
          ExecInitScanTupleSlot


PortalRun => PortalRunSelect => ExecutorRun => standard_ExecutorRun => ExecutePlan =>
  loop
    ExecProcNode => ... => ExecSeqScan =>
      (this is a common routine for all scans (e.g. ExecIndexScan))
      ExecScan(ScanState, SeqNext, SeqRecheck) =>
        loop
          ExecScanFetch =>
            (TODO: EvalPlanQual recheck)
            SeqNext (as implementation of accessMtd) =>
              table_beginscan(Relation, ...) => TableAmRoutine.scan_begin (e.g. heap_beginscan)
              table_scan_getnextslot => TableAmRoutine.scan_getnextslot (e.g. heap_getnextslot)
          ExecQual (check if fetched tuple passes WHERE condition) => ExecEvalExprSwitchContext
          ExecProject


--------------------
-- data structure --
--------------------

SelectStmt
  List<ResTarget> targetList
  List fromClause (e.g. RangeVar, JoinExpr)
  Node whereClause

JoinExpr
  JoinType
  Node larg, rarg (TODO: what's `rarg`???)
  Node* quals (ON expression)

ParseState
  p_joinlist (e.g. RangeTblRef, ???)

Query
  FromExpr jointree
    fromlist (ParseState.joinlist)
    Node quals (WHERE expression)

SeqScan < Scan < Plan

SeqScanState < ScanState < PlanState

misc

  • GIN (generalized inverted index)
    • debug via e.g. DEV_Q="insert into t_gin_test_tbl" make check-tests EXTRA_TESTS="gin"
    • multicolumn index
    • array intersection
      • Q. Doesn't it require multiple B-tree traversal to test each array element in the query? (essentially it's OR of multiple single element containment)
    • B-tree traversal and key comparison
diff --git a/src/test/regress/sql/gin.sql b/src/test/regress/sql/gin.sql
index 5194afcc1f..276b1bc67d 100644
--- a/src/test/regress/sql/gin.sql
+++ b/src/test/regress/sql/gin.sql
@@ -73,6 +73,16 @@ select * from t_gin_test_tbl where array[0] <@ i;
 select * from t_gin_test_tbl where array[0] <@ i;
 select * from t_gin_test_tbl where array[0] <@ i and '{}'::int4[] <@ j;

+explain (costs off)
+select * from t_gin_test_tbl where array[1] <@ i and array[10] <@ j;
+
+select * from t_gin_test_tbl where array[1] <@ i and array[10] <@ j;
+
+explain (costs off)
+select * from t_gin_test_tbl where array[1, 2] && i and array[10] && j;
+
+select * from t_gin_test_tbl where array[1, 2] && i and array[10] && j;
+
 explain (costs off)
 select * from t_gin_test_tbl where i @> '{}';
 select * from t_gin_test_tbl where i @> '{}';
diff -U3 /home/hiroshi/code/others/postgres/src/test/regress/expected/gin.out /home/hiroshi/code/others/postgres/Debug/src/test/regress/results/gin.out
--- /home/hiroshi/code/others/postgres/src/test/regress/expected/gin.out	2022-01-31 23:44:33.341760521 +0900
+++ /home/hiroshi/code/others/postgres/Debug/src/test/regress/results/gin.out	2022-02-20 13:01:38.159916324 +0900
@@ -110,6 +110,41 @@
 (0 rows)

 explain (costs off)
+select * from t_gin_test_tbl where array[1] <@ i and array[10] <@ j;
+                                 QUERY PLAN
+----------------------------------------------------------------------------
+ Bitmap Heap Scan on t_gin_test_tbl
+   Recheck Cond: (('{1}'::integer[] <@ i) AND ('{10}'::integer[] <@ j))
+   ->  Bitmap Index Scan on t_gin_test_tbl_i_j_idx
+         Index Cond: ((i @> '{1}'::integer[]) AND (j @> '{10}'::integer[]))
+(4 rows)
+
+select * from t_gin_test_tbl where array[1] <@ i and array[10] <@ j;
+   i   |  j
+-------+------
+ {1,2} | {10}
+ {1,1} | {10}
+(2 rows)
+
+explain (costs off)
+select * from t_gin_test_tbl where array[1, 2] && i and array[10] && j;
+                                  QUERY PLAN
+------------------------------------------------------------------------------
+ Bitmap Heap Scan on t_gin_test_tbl
+   Recheck Cond: (('{1,2}'::integer[] && i) AND ('{10}'::integer[] && j))
+   ->  Bitmap Index Scan on t_gin_test_tbl_i_j_idx
+         Index Cond: ((i && '{1,2}'::integer[]) AND (j && '{10}'::integer[]))
+(4 rows)
+
+select * from t_gin_test_tbl where array[1, 2] && i and array[10] && j;
+   i   |  j
+-------+------
+ {1,2} | {10}
+ {2}   | {10}
+ {1,1} | {10}
+(3 rows)
+
+explain (costs off)
 select * from t_gin_test_tbl where i @> '{}';
                     QUERY PLAN
 ---------------------------------------------------
---------------------------------
-- insert (index construction) --
---------------------------------

ExecInsert(ModifyTableState, ...) =>
  table_tuple_insert
  ExecInsertIndexTuples(ResultRelInfo, ...) =>
    for each index relation in ResultRelInfo.ri_IndexRelationDescs
      (for partial index) skip this index if ExecQual is evaluated to false
      FormIndexDatum (exact `values` and `isnull` from the tuple for the indexed columns)
      index_insert(Relation, Datum* values, bool* isnull, ItemPointer, ...) =>
        gininsert (as IndexAmRoutine.aminsert) =>
          (on the first use of GIN routine)
            palloc(GinState)
            initGinState
          (if fast update mode)
            ginHeapTupleFastCollect =>
              ginExtractEntries =>
                FunctionCall3Coll (call `extractValueFn` of the data type e.g. `ginarrayextract`)
              for each entry
                GinFormTuple =>
                  index_form_tuple(TupleDesc, values, isnull) (construct IndexTuple with given data and description)
            ginHeapTupleFastInsert =>
              append collected `IndexTuple` to pending list
              (when pending list is too long, move entries to B-tree)
                ginInsertCleanup => ginEntryInsert => ...


--------------------------------
-- select (bitmap index scan) --
--------------------------------

(plan)
query_planner => ... => create_index_path => cost_index(IndexPath, PlannerInfo, ...) =>
  gincostestimate (as IndexOptInfo.amcostestimate) =>
    for each IndexClause in IndexPath.indexclauses
      for each RestrictInfo in IndexClause.indexquals
        gincost_opexpr (count the number of GIN keys for constant operand in the query) =>
          gincost_pattern =>
            get_op_opfamily_properties
            get_opfamily_proc
            FunctionCall7Coll => ginqueryarrayextract (returns the number of keys)


(execute)
ExecInitBitmapHeapScan => ... => ginbeginscan =>
  RelationGetIndexScan
  palloc(GinScanOpaqueData)
  initGinState(GinState, Relation)


ExecBitmapHeapScan => ExecScan(..., BitmapHeapNext, BitmapHeapRecheck)


BitmapHeapNext =>
  (on initialization)
    MultiExecProcNode => MultiExecBitmapIndexScan =>
      index_getbitmap(IndexScanDesc, TIDBitmap) =>
        gingetbitmap (as IndexAmRoutine.amgetbitmap) => ...
    tbm_begin_iterate
  loop
    (if TBMIterateResult is NULL)
      tbm_iterate (find next TBMIterateResult from TIDBitmap)
      (if still NULL) break
    table_scan_bitmap_next_tuple => TableAmRoutine.scan_bitmap_next_tuple
    return fetched tuple


gingetbitmap =>
  ginNewScanKey =>
    for each index condition
      extractQueryFn (e.g. ginarrayextract)
      ginFillScanKey =>
        for each extracted key
          ginFillScanEntry
  scanPendingInsert =>
    loop fast-update pending list
      collectMatchesForHeapRow (initialize data to pass into "consistent" function) =>
        for each index condition
          for each extracted key
            PageGetItem
            gintuple_get_key
            ginCompareEntries (compare query key to gin tuple data)
      directBoolConsistentFn (call e.g. ginarrayconsistent)
      (if consistent) tbm_add_tuples
  startScan => ???
  loop
    scanGetItem =>
      loop until match
        for each index condition
          keyGetItem =>
            for each extracted key
              entryGetItem => ???
          (if not match) continue
          ItemPointerSet
    (if no item) break
    tbm_add_tuples(TIDBitmap, ItemPointerData, ...)


--------------------
-- data structure --
--------------------

ResultRelInfo
  int ri_NumIndices (the number of indices needed to updated for this result tuple)
  Relation* ri_IndexRelationDescs
  IndexInfo** ri_IndexRelationInfo
    void* ii_AmCache (internal private data for index implementation (e.g. GinState))


IndexPath < Path
  List<IndexClause> indexclauses
    AttrNumber indexcol
    List<RestrictInfo> indexquals
      Expr clause
  List<???> indexorderbys, indexorderbycols
  IndexOptInfo
    amcostestimate (implementation specific index-scan cost estimator)


BitmapHeapPath < Path
  Path* bitmapqual (e.g. IndexPath, BItmapAndPath, BitmapOrPath)


BitmapHeapScanState < ScanState
  TIDBitmap* tbm
  TBMIterator* tbmiterator
    TIDBitmap* tbm
    TBMIterateResult output
  TBMIterateResult* tbmres (a pointer to multiple TIDs)
    BlockNumber blockno
    OffsetNumber offsets[]


IndexScanDescData
  void* opaque (e.g. GinScanOpaque)
    GinState
    GinScanKey keys (key for each WHERE index condition)
      GinScanEntry* requiredEntries (entry for each GIN key condition (e.g. `col @> array[1, 2]` will create two entries))
      GinTernaryValue* entryRes (aka `bool check[]` arguments for "consistent" gin api)
    GinScanEntry entries


GinState
  Relation index
  bool oneCol (false for composite index)
  TupleDesc tupdesc[] (tuple description used for a B-tree of GIN keys)
  FmgrInfo compareFn[], extractValueFn[], ... (GIN operators for each column when composite index)


GinTupleCollector
  IndexTuple* tuples


IndexTuple (the extra data is allocated to keep the actual tuple and the overall byte size will be tracked in t_info)
  ItemPointerData (pointer to heap data)
  uint16_t t_info (lower 12 bits cointans the actual size)
  • JOIN
    • debug via make check-tests EXTRA_TESTS="test_setup create_misc create_index join"
ExecNestLoop => ???
(analyse)
transformStmt => transformExplainStmt =>
  transformOptionalSelectInto (recursively for the query to be explained) => ...


(execution as CMD_UTILITY)
standard_processUtility => ExplainQuery =>
  NewExplainState
  QueryRewrite
  ExplainOneQuery(Query, ...) =>
    pg_plan_query
    ExplainOnePlan(PlannedStmt, ...) =>
      CreateQueryDesc
      ExecutorStart
      (if "ANALYZE")
        ExecutorRun
        ExecutorFinish
      ExplainPrintPlan(ExplainState, QueryDesc) =>
        ExplainPreScanNode (traverse plan tree to find the referenced RTE)
        ExplainNode(PlanState, ..., ExplainState) =>
          main routine (1K LOC) to emit information based on NodeTag
          recursively call ExplainNode for child plans (`lfettree` and `righttree`) if any

--------------------
-- data structure --
--------------------

ExplainStmt
  Node* query

ExplainState
  bool analyze
heapam_tuple_insert(Relation, TupleTableSlot, ...) =>
  ExecFetchSlotHeapTuple(TupleTableSlot, ...) -> HeapTuple
  heap_insert(Relation, HeapTuple) =>
    GetCurrentTransactionId (from global context)
    heap_prepare_insert
    RelationGetBufferForTuple => ???
    RelationPutHeapTuple(Relation, Buffer, HeapTuple) =>
      BufferGetPage
      PageAddItem/PageAddItemExtended => ???
      ItemPointerSet (set HeapTuple.t_self)
    (TODO: XLOG stuff)
  ItemPointerCopy (from inserted HeapTuple to TupleTableSlot)


heap_beginscan =>
  palloc HeapScanDesc
  initscan


heap_getnextslot =>
  heapgettup_pagemode => ???
  ExecStoreBufferHeapTuple (copy from `HeapScanDesc.rs_ctup` to result slot `TupleTableSlot`)


--------------------
-- data structure --
--------------------

HeapTuple
  ItemPointerData t_self (a pair of block index and offset)
  HeapTupleHeader
    ItemPointerData

HeapScanDesc < TableScanDesc
  HeapTupleData rs_ctup (currently referenced tuple during scan)
  • instantiate Relation from Oid
relation_open(Oid, ...) => RelationIdGetRelation
  RelationIdCacheLookup (return if cache hit)
  RelationBuildDesc =>
    ScanPgRelation (find HeapTuple/Form_pg_class from `pg_class`)
    (if Form_pg_class.relkind == RELKIND_INDEX, then setup Relation.rd_indam)
      RelationInitIndexAccessInfo =>
        SearchSysCache1(INDEXRELID, ...) (fetch pg_index)
        SearchSysCache1(AMOID, ...) (fetch pg_am with pg_class.relam OID)
        InitIndexAmRoutine =>
          GetIndexAmRoutine =>
            OidFunctionCall0 (calling fmgr-based function amhandler) =>
              fmgr_info => ...
              FunctionCall0Coll =>
                InitFunctionCallInfoData
                FunctionCallInvoke (e.g. bthandler to initialize IndexAmRoutine)
        IndexSupportInitialize => ???

    (if Form_pg_class.relkind == RELKIND_RELATION, then setup Relation.rd_tableam)
      RelationInitTableAccessMethod =>
        (if IsCatalogRelation) use F_HEAP_TABLEAM_HANDLER constant as amhandler OID
        (otherwise) fetch pg_am with pg_class.relam OID
        GetTableAmRoutine (call e.g. heap_tableam_handler to initialize TableAmRoutine)

    RelationCacheInsert
  • misc data structure
SysScanDesc
  Relation heap_rel (e.g. pg_database)
  Relation irel     (e.g. pg_database_datname_index)
  IndexScanDesc
  TupleTableSlot
    Datum tts_values
    bool tts_isnull


IndexScanDesc
  ItemPointerData xs_heaptid (matching result is kept here during scan)
  IndexFetchTableData* xs_heapfetch


Relation
  RelFileNode
  Form_pg_class
  Oid rd_amhandler (pg_proc oid for index access method handler (e.g. bthandler) or table access method (e.g. heap_tableam_handler))
  TupleDesc
  # for table
  TableAmRoutine
  # for index
  Form_pg_index
  IndexAmRoutine


PlanState
  ExprState* qual
  PlanState* lefttree, righttree (sub plan trees)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment