/*
* Copyright 2008 Search Solution Corporation
* Copyright 2016 CUBRID Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/*
* page_buffer.c - Page buffer management module (at the server)
*/
#ident "$Id$"
#include "config.h"
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <assert.h>
#include <atomic>
#include "page_buffer.h"
#include "storage_common.h"
#include "memory_alloc.h"
#include "system_parameter.h"
#include "error_manager.h"
#include "file_io.h"
#include "lockfree_circular_queue.hpp"
#include "log_append.hpp"
#include "log_manager.h"
#include "log_impl.h"
#include "log_volids.hpp"
#include "transaction_sr.h"
#include "memory_hash.h"
#include "critical_section.h"
#include "perf_monitor.h"
#include "porting_inline.hpp"
#include "environment_variable.h"
#include "thread_looper.hpp"
#if defined (SERVER_MODE)
#include "thread_daemon.hpp"
#endif
#include "thread_entry_task.hpp"
#include "thread_manager.hpp"
#include "list_file.h"
#include "tsc_timer.h"
#include "query_manager.h"
#include "xserver_interface.h"
#include "btree_load.h"
#include "boot_sr.h"
#include "double_write_buffer.hpp"
#include "resource_tracker.hpp"
#include "tde.h"
#include "show_scan.h"
#include "numeric_opfunc.h"
#include "dbtype.h"
#include "scope_exit.hpp"
#if defined(SERVER_MODE)
#include "connection_error.h"
#endif /* SERVER_MODE */
#if defined(ENABLE_SYSTEMTAP)
#include "probes.h"
#endif /* ENABLE_SYSTEMTAP */
#include "thread_entry.hpp"
// XXX: SHOULD BE THE LAST INCLUDE HEADER
#include "memory_wrapper.hpp"
const VPID vpid_Null_vpid = { NULL_PAGEID, NULL_VOLID };
/* minimum number of buffers */
#define PGBUF_MINIMUM_BUFFERS (MAX_NTRANS * 10)
/* BCB holder list related constants */
/* Each thread has its own free BCB holder list.
The list has PGBUF_DEFAULT_FIX_COUNT entries by default. */
#define PGBUF_DEFAULT_FIX_COUNT 7
/* Each BCB holder array allocated from the OS has PGBUF_NUM_ALLOC_HOLDER elements (BCB holder entries). */
#define PGBUF_NUM_ALLOC_HOLDER 10
#if !defined(SERVER_MODE)
/* TODO: do we need to do this? */
#define pthread_mutex_init(a, b)
#define pthread_mutex_destroy(a)
#define pthread_mutex_lock(a) 0
#define pthread_mutex_unlock(a)
static int rv;
#endif /* !SERVER_MODE */
#define PGBUF_FIX_COUNT_THRESHOLD 64 /* fix count threshold. used as indicator for hot pages. */
/* default latch timeout used instead of infinite wait: 300 seconds, stored in msec */
static int pgbuf_latch_timeout = 300 * 1000;
/* size of io page */
#if defined(CUBRID_DEBUG)
#define SIZEOF_IOPAGE_PAGESIZE_AND_GUARD() (IO_PAGESIZE + sizeof (pgbuf_Guard))
#else /* CUBRID_DEBUG */
#define SIZEOF_IOPAGE_PAGESIZE_AND_GUARD() (IO_PAGESIZE)
#endif /* CUBRID_DEBUG */
/* size of one buffer page <BCB, page> */
#define PGBUF_BCB_SIZEOF (sizeof (PGBUF_BCB))
#define PGBUF_IOPAGE_BUFFER_SIZE \
((size_t)(offsetof (PGBUF_IOPAGE_BUFFER, iopage) + \
SIZEOF_IOPAGE_PAGESIZE_AND_GUARD()))
/* size of buffer hash entry */
#define PGBUF_BUFFER_HASH_SIZEOF (sizeof (PGBUF_BUFFER_HASH))
/* size of buffer lock record */
#define PGBUF_BUFFER_LOCK_SIZEOF (sizeof (PGBUF_BUFFER_LOCK))
/* size of one LRU list structure */
#define PGBUF_LRU_LIST_SIZEOF (sizeof (PGBUF_LRU_LIST))
/* size of BCB holder entry */
#define PGBUF_HOLDER_SIZEOF (sizeof (PGBUF_HOLDER))
/* size of BCB holder array that is allocated in one time */
#define PGBUF_HOLDER_SET_SIZEOF (sizeof (PGBUF_HOLDER_SET))
/* size of BCB holder anchor */
#define PGBUF_HOLDER_ANCHOR_SIZEOF (sizeof (PGBUF_HOLDER_ANCHOR))
/* get memory address(pointer) */
#define PGBUF_FIND_BCB_PTR(i) \
((PGBUF_BCB *) ((char *) &(pgbuf_Pool.BCB_table[0]) + (PGBUF_BCB_SIZEOF * (i))))
#define PGBUF_FIND_IOPAGE_PTR(i) \
((PGBUF_IOPAGE_BUFFER *) ((char *) &(pgbuf_Pool.iopage_table[0]) + (PGBUF_IOPAGE_BUFFER_SIZE * (i))))
#define PGBUF_FIND_IOPAGE_PTR_FROM_EXTERNAL_ALLOCATE(alloc_memory, i) \
((PGBUF_IOPAGE_BUFFER *) ((char *) (alloc_memory) + (PGBUF_IOPAGE_BUFFER_SIZE * (i))))
#define PGBUF_FIND_BUFFER_GUARD(bufptr) \
(&bufptr->iopage_buffer->iopage.page[DB_PAGESIZE])
/* macros for casting pointers */
#define CAST_PGPTR_TO_BFPTR(bufptr, pgptr) \
do { \
(bufptr) = ((PGBUF_BCB *) ((PGBUF_IOPAGE_BUFFER *) \
((char *) pgptr - offsetof (PGBUF_IOPAGE_BUFFER, iopage.page)))->bcb); \
assert ((bufptr) == (bufptr)->iopage_buffer->bcb); \
} while (0)
#define CAST_PGPTR_TO_IOPGPTR(io_pgptr, pgptr) \
do { \
(io_pgptr) = (FILEIO_PAGE *) ((char *) pgptr - offsetof (FILEIO_PAGE, page)); \
} while (0)
#define CAST_IOPGPTR_TO_PGPTR(pgptr, io_pgptr) \
do { \
(pgptr) = (PAGE_PTR) ((char *) (io_pgptr)->page); \
} while (0)
#define CAST_BFPTR_TO_PGPTR(pgptr, bufptr) \
do { \
assert ((bufptr) == (bufptr)->iopage_buffer->bcb); \
(pgptr) = ((PAGE_PTR) ((char *) (bufptr->iopage_buffer) + offsetof (PGBUF_IOPAGE_BUFFER, iopage.page))); \
} while (0)
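/* memory layout behind the casts above (one buffer slot, roughly; FILEIO_PAGE keeps its reserved/system area before
 * the page body):
 *
 * PGBUF_IOPAGE_BUFFER: [ bcb pointer | alignment | FILEIO_PAGE: [ prv ... | page[] ] ]
 *                                                                            ^-- PAGE_PTR handed to callers
 *
 * CAST_PGPTR_TO_BFPTR therefore subtracts offsetof (PGBUF_IOPAGE_BUFFER, iopage.page) from the page pointer to
 * recover the io page buffer, then follows its bcb pointer back to the BCB. */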
/* check whether the given volume is auxiliary volume */
#define PGBUF_IS_AUXILIARY_VOLUME(volid) ((volid) < LOG_DBFIRST_VOLID)
/************************************************************************/
/* Page buffer zones section */
/************************************************************************/
/* (bcb flags + zone = 2 bytes) + (lru index = 2 bytes); lru index values start from 0. */
/* if that changes, make the right updates here. */
#define PGBUF_LRU_NBITS 16
#define PGBUF_LRU_LIST_MAX_COUNT ((int) 1 << PGBUF_LRU_NBITS) /* 64k */
#define PGBUF_LRU_INDEX_MASK (PGBUF_LRU_LIST_MAX_COUNT - 1) /* 0x0000FFFF */
/* PGBUF_ZONE - enumeration with all page buffer zones */
typedef enum
{
/* zone values start after reserved values for lru indexes */
/* LRU zones explained:
 * 1. This is the hottest zone, where most fixed/unfixed bcb's are found. We'd like to keep the page unfix
 * complexity to a minimum, therefore no boosts to top are done here. This zone's bcb's cannot be victimized.
 * 2. This is a buffer between the hot lru 1 zone and the victimization lru 3 zone. The buffer zone gives bcb's that
 * fall from the first zone a chance to be boosted back to top (if they are still hot). Victimization is still not
 * allowed.
 * 3. The third zone is the victimization zone. BCB's can still be boosted if fixed/unfixed, but in aggressively
 * victimizing systems, non-dirty bcb's rarely survive here.
 */
PGBUF_LRU_1_ZONE = 1 << PGBUF_LRU_NBITS,
PGBUF_LRU_2_ZONE = 2 << PGBUF_LRU_NBITS,
PGBUF_LRU_3_ZONE = 3 << PGBUF_LRU_NBITS,
/* make sure lru zone mask covers all lru zone values */
PGBUF_LRU_ZONE_MASK = PGBUF_LRU_1_ZONE | PGBUF_LRU_2_ZONE | PGBUF_LRU_3_ZONE,
/* other zone values must have a completely different mask than lru zone. so also skip the two bits used for
* PGBUF_LRU_ZONE_MASK */
PGBUF_INVALID_ZONE = 1 << (PGBUF_LRU_NBITS + 2), /* invalid zone */
PGBUF_VOID_ZONE = 2 << (PGBUF_LRU_NBITS + 2), /* void zone: temporary zone after reading a bcb from disk and
 * until adding it to a lru list, or after removing it from a
 * lru list and until victimizing it. */
/* zone mask should cover all zone values */
PGBUF_ZONE_MASK = (PGBUF_LRU_ZONE_MASK | PGBUF_INVALID_ZONE | PGBUF_VOID_ZONE),
} PGBUF_ZONE;
#define PGBUF_MAKE_ZONE(list_id, zone) ((list_id) | (zone))
#define PGBUF_GET_ZONE(flags) ((PGBUF_ZONE) ((flags) & PGBUF_ZONE_MASK))
#define PGBUF_GET_LRU_INDEX(flags) ((flags) & PGBUF_LRU_INDEX_MASK)
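/* illustrative sketch of how a flags word combines a lru index with a zone tag (values chosen for the example):
 *
 *   int flags = PGBUF_MAKE_ZONE (5, PGBUF_LRU_2_ZONE);
 *   assert (PGBUF_GET_ZONE (flags) == PGBUF_LRU_2_ZONE);
 *   assert (PGBUF_GET_LRU_INDEX (flags) == 5);
 *
 * the low PGBUF_LRU_NBITS bits carry the lru index; the zone tag lives in the bits above them. */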
/************************************************************************/
/* Page buffer BCB section */
/************************************************************************/
/* bcb flags */
/* dirty: false initially, is set to true when page is modified. set to false again when flushed to disk. */
#define PGBUF_BCB_DIRTY_FLAG ((int) 0x80000000)
/* is flushing: set to true when someone intends to flush the bcb to disk. dirty flag is usually set to false, but
* bcb cannot be yet victimized. flush must succeed first. */
#define PGBUF_BCB_FLUSHING_TO_DISK_FLAG ((int) 0x40000000)
/* flag to mark bcb was directly victimized. we can have certain situations when victimizations fail. the thread goes
 * to sleep then and waits to be awakened by another thread, which also assigns it a bcb directly. there can be multiple
 * providers of such bcb's.
 * there is a small window of opportunity for active workers to fix this bcb. when fixing a direct victim, we need to
 * replace the flag with PGBUF_BCB_INVALIDATE_DIRECT_VICTIM_FLAG. there is no point in victimizing this bcb to fix it
 * again. The thread waiting for the bcb will know it was fixed again and will request another bcb. */
#define PGBUF_BCB_VICTIM_DIRECT_FLAG ((int) 0x20000000)
#define PGBUF_BCB_INVALIDATE_DIRECT_VICTIM_FLAG ((int) 0x10000000)
/* flag for unlatch bcb to move it to the bottom of lru when fix count is 0. usually set when page is deallocated */
#define PGBUF_BCB_MOVE_TO_LRU_BOTTOM_FLAG ((int) 0x08000000)
/* flag for pages that should be vacuumed. */
#define PGBUF_BCB_TO_VACUUM_FLAG ((int) 0x04000000)
/* flag for asynchronous flush request */
#define PGBUF_BCB_ASYNC_FLUSH_REQ ((int) 0x02000000)
/* add all flags here */
#define PGBUF_BCB_FLAGS_MASK \
(PGBUF_BCB_DIRTY_FLAG \
| PGBUF_BCB_FLUSHING_TO_DISK_FLAG \
| PGBUF_BCB_VICTIM_DIRECT_FLAG \
| PGBUF_BCB_INVALIDATE_DIRECT_VICTIM_FLAG \
| PGBUF_BCB_MOVE_TO_LRU_BOTTOM_FLAG \
| PGBUF_BCB_TO_VACUUM_FLAG \
| PGBUF_BCB_ASYNC_FLUSH_REQ)
/* add flags that invalidate a victim candidate here */
/* 1. dirty bcb's cannot be victimized.
* 2. bcb's that are in the process of being flushed cannot be victimized. flush must succeed!
* 3. bcb's that are already assigned as victims are not valid victim candidates.
*/
#define PGBUF_BCB_INVALID_VICTIM_CANDIDATE_MASK \
(PGBUF_BCB_DIRTY_FLAG \
| PGBUF_BCB_FLUSHING_TO_DISK_FLAG \
| PGBUF_BCB_VICTIM_DIRECT_FLAG \
| PGBUF_BCB_INVALIDATE_DIRECT_VICTIM_FLAG)
/* bcb has no flag initially and is in invalid zone */
#define PGBUF_BCB_INIT_FLAGS PGBUF_INVALID_ZONE
/* fix & avoid dealloc counter... we have one integer and each uses two bytes. fix counter is offset by two bytes. */
#define PGBUF_BCB_COUNT_FIX_SHIFT_BITS 16
#define PGBUF_BCB_AVOID_DEALLOC_MASK ((int) 0x0000FFFF)
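/* worked example of the packing: for count_fix_and_avoid_dealloc = 0x00050002, the fix counter is
 * (0x00050002 >> PGBUF_BCB_COUNT_FIX_SHIFT_BITS) = 5 and the avoid-dealloc counter is
 * (0x00050002 & PGBUF_BCB_AVOID_DEALLOC_MASK) = 2; a fix increment would atomically add
 * (1 << PGBUF_BCB_COUNT_FIX_SHIFT_BITS), leaving the low half untouched. */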
/* Activity on each LRU is probed and accumulated;
 * to avoid a long history accumulation effect, the activity indicator is capped (PGBUF_TRAN_MAX_ACTIVITY);
 * An inactivity threshold is defined: private LRUs dropping beneath this threshold are destroyed and their BCBs
 * victimized.
 */
#define PGBUF_TRAN_THRESHOLD_ACTIVITY (pgbuf_Pool.num_buffers / 4)
#define PGBUF_TRAN_MAX_ACTIVITY (10 * PGBUF_TRAN_THRESHOLD_ACTIVITY)
#define PGBUF_AOUT_NOT_FOUND -2
#if defined (SERVER_MODE)
/* vacuum workers and checkpoint thread should not contribute to promoting a bcb as active/hot */
#define PGBUF_VACUUM_SHOULD_IGNORE_UNFIX(th) VACUUM_IS_THREAD_VACUUM_WORKER (th)
#else
#define PGBUF_VACUUM_SHOULD_IGNORE_UNFIX(th) false
#endif
#if defined (SERVER_MODE)
/* vacuum workers, the checkpoint thread and temporary pages should not contribute to promoting a bcb as active/hot */
#define PGBUF_SHOULD_IGNORE_UNFIX(th, buf) \
(VACUUM_IS_THREAD_VACUUM_WORKER (th) || pgbuf_is_temporary_volume ((buf)->vpid.volid))
#else
#define PGBUF_SHOULD_IGNORE_UNFIX(th, buf) false
#endif
#define HASH_SIZE_BITS 20
#define PGBUF_HASH_SIZE (1 << HASH_SIZE_BITS)
#define UINT16MAX 65534
#define PGBUF_HASH_VALUE(vpid) pgbuf_hash_func_mirror(vpid)
/* Maximum overboost flush multiplier: controls the maximum factor to apply to configured flush ratio,
* when the miss rate (victim_request/fix_request) increases.
*/
#define PGBUF_FLUSH_VICTIM_BOOST_MULT 10
#define PGBUF_NEIGHBOR_FLUSH_NONDIRTY \
(prm_get_bool_value (PRM_ID_PB_NEIGHBOR_FLUSH_NONDIRTY))
#define PGBUF_MAX_NEIGHBOR_PAGES 32
#define PGBUF_NEIGHBOR_PAGES \
(prm_get_integer_value (PRM_ID_PB_NEIGHBOR_FLUSH_PAGES))
#define PGBUF_NEIGHBOR_POS(idx) (PGBUF_NEIGHBOR_PAGES - 1 + (idx))
/* maximum number of simultaneous fixes a thread may have on the same page */
#define PGBUF_MAX_PAGE_WATCHERS 64
/* maximum number of simultaneous fixed pages from a single thread */
#define PGBUF_MAX_PAGE_FIXED_BY_TRAN 64
/* max and min flush rate in pages/sec during checkpoint */
#define PGBUF_CHKPT_MAX_FLUSH_RATE 1200
#define PGBUF_CHKPT_MIN_FLUSH_RATE 50
/* default pages to flush in each interval during log checkpoint */
#define PGBUF_CHKPT_BURST_PAGES 16
#define INIT_HOLDER_STAT(perf_stat) \
do \
{ \
(perf_stat)->dirty_before_hold = 0; \
(perf_stat)->dirtied_by_holder = 0; \
(perf_stat)->hold_has_write_latch = 0; \
(perf_stat)->hold_has_read_latch = 0; \
} \
while (0)
/* use define PGBUF_ORDERED_DEBUG to enable extended debug for ordered fix */
// todo - is it better to replace with a system parameter?
#undef PGBUF_ORDERED_DEBUG
#define PGBUF_LRU_ZONE_MIN_RATIO 0.05f
#define PGBUF_LRU_ZONE_MAX_RATIO 0.90f
/* buffer lock return value */
enum
{
PGBUF_LOCK_WAITER = 0, PGBUF_LOCK_HOLDER
};
/* constants to indicate the content state of buffers */
enum
{
PGBUF_CONTENT_BAD = 0, /* A bug in the system */
PGBUF_CONTENT_GOOD, /* Content is consistent */
PGBUF_CONTENT_LIKELY_BAD, /* Maybe a bug in the system */
PGBUF_CONTENT_ERROR /* Some kind of error */
};
typedef struct pgbuf_holder PGBUF_HOLDER;
typedef struct pgbuf_holder_anchor PGBUF_HOLDER_ANCHOR;
typedef struct pgbuf_holder_set PGBUF_HOLDER_SET;
/* *INDENT-OFF* */
typedef std::atomic<uint64_t> PGBUF_ATOMIC_LATCH;
/* *INDENT-ON* */
typedef union pgbuf_atomic_latch_impl PGBUF_ATOMIC_LATCH_IMPL;
typedef struct pgbuf_bcb PGBUF_BCB;
typedef struct pgbuf_iopage_buffer PGBUF_IOPAGE_BUFFER;
typedef struct pgbuf_aout_buf PGBUF_AOUT_BUF;
typedef struct pgbuf_buffer_lock PGBUF_BUFFER_LOCK;
typedef struct pgbuf_buffer_hash PGBUF_BUFFER_HASH;
typedef struct pgbuf_lru_list PGBUF_LRU_LIST;
typedef struct pgbuf_aout_list PGBUF_AOUT_LIST;
typedef struct pgbuf_seq_flusher PGBUF_SEQ_FLUSHER;
typedef struct pgbuf_invalid_list PGBUF_INVALID_LIST;
typedef struct pgbuf_victim_candidate_list PGBUF_VICTIM_CANDIDATE_LIST;
typedef struct pgbuf_buffer_pool PGBUF_BUFFER_POOL;
typedef struct pgbuf_monitor_bcb_mutex PGBUF_MONITOR_BCB_MUTEX;
typedef struct pgbuf_holder_info PGBUF_HOLDER_INFO;
typedef struct pgbuf_status PGBUF_STATUS;
typedef struct pgbuf_status_snapshot PGBUF_STATUS_SNAPSHOT;
typedef struct pgbuf_status_old PGBUF_STATUS_OLD;
struct pgbuf_status
{
unsigned long long num_hit;
unsigned long long num_page_request;
unsigned long long num_pages_created;
unsigned long long num_pages_written;
unsigned long long num_pages_read;
unsigned int num_flusher_waiting_threads;
unsigned int dummy;
};
struct pgbuf_status_snapshot
{
unsigned int free_pages;
unsigned int victim_candidate_pages;
unsigned int clean_pages;
unsigned int dirty_pages;
unsigned int num_index_pages;
unsigned int num_data_pages;
unsigned int num_system_pages;
unsigned int num_temp_pages;
};
struct pgbuf_status_old
{
unsigned long long num_hit;
unsigned long long num_page_request;
unsigned long long num_pages_created;
unsigned long long num_pages_written;
unsigned long long num_pages_read;
time_t print_out_time;
};
struct pgbuf_holder_info
{
VPID vpid; /* page to which holder refers */
PGBUF_ORDERED_GROUP group_id; /* group (VPID of heap header) of the page */
int rank; /* rank of page (PGBUF_ORDERED_RANK) */
int watch_count; /* number of watchers on this holder */
PGBUF_WATCHER *watcher[PGBUF_MAX_PAGE_WATCHERS]; /* pointers to all watchers to this holder */
PGBUF_LATCH_MODE latch_mode; /* aggregate latch mode of all watchers */
PAGE_TYPE ptype; /* page type (should be HEAP or OVERFLOW) */
bool prevent_dealloc; /* page is prevented from being deallocated. */
};
typedef struct pgbuf_holder_stat PGBUF_HOLDER_STAT;
/* Holder flags used by perf module */
struct pgbuf_holder_stat
{
unsigned dirty_before_hold:1; /* page was dirty before holder was acquired */
unsigned dirtied_by_holder:1; /* page was dirtied by holder */
unsigned hold_has_write_latch:1; /* page has/had write latch */
unsigned hold_has_read_latch:1; /* page has/had read latch */
};
typedef struct pgbuf_batch_flush_helper PGBUF_BATCH_FLUSH_HELPER;
struct pgbuf_batch_flush_helper
{
int npages;
int fwd_offset;
int back_offset;
PGBUF_BCB *pages_bufptr[2 * PGBUF_MAX_NEIGHBOR_PAGES - 1];
VPID vpids[2 * PGBUF_MAX_NEIGHBOR_PAGES - 1];
};
/* BCB holder entry */
struct pgbuf_holder
{
int fix_count; /* the count of fix by the holder */
PGBUF_BCB *bufptr; /* pointer to BCB */
PGBUF_HOLDER *thrd_link; /* the next BCB holder entry in the BCB holder list of thread */
PGBUF_HOLDER *next_holder; /* free BCB holder list of thread */
PGBUF_HOLDER_STAT perf_stat;
#if !defined(NDEBUG)
char fixed_at[64 * 1024];
int fixed_at_size;
#endif /* NDEBUG */
int watch_count;
PGBUF_WATCHER *first_watcher;
PGBUF_WATCHER *last_watcher;
};
/* thread related BCB holder list (it is owned by each thread) */
struct pgbuf_holder_anchor
{
int num_free_cnt; /* # of free BCB holder entries */
int num_hold_cnt; /* # of used BCB holder entries */
PGBUF_HOLDER *thrd_free_list; /* free BCB holder list */
PGBUF_HOLDER *thrd_hold_list; /* used(or hold) BCB holder list */
};
/* the entry (array structure) of the free BCB holder list shared by threads */
struct pgbuf_holder_set
{
PGBUF_HOLDER element[PGBUF_NUM_ALLOC_HOLDER]; /* BCB holder array */
PGBUF_HOLDER_SET *next_set; /* next array */
};
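/* pgbuf_atomic_latch_impl - overlays the latch state (mode, waiter flag, fix count) on a single 64-bit word so all of
 * it can be loaded and compare-exchanged atomically through the 'raw' view. note this relies on the assumption that
 * the three fields (plus any padding) fit in the 8 bytes of 'raw'. */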
union pgbuf_atomic_latch_impl
{
uint64_t raw;
struct
{
PGBUF_LATCH_MODE latch_mode;
uint16_t waiter_exists;
int32_t fcnt;
} impl;
};
/* BCB structure */
struct pgbuf_bcb
{
#if defined(SERVER_MODE)
pthread_mutex_t mutex; /* BCB mutex */
int owner_mutex; /* mutex owner */
#endif /* SERVER_MODE */
VPID vpid; /* Volume and page identifier of resident page */
PGBUF_ATOMIC_LATCH atomic_latch; /* atomic latch */
volatile int flags;
#if defined(SERVER_MODE)
THREAD_ENTRY *next_wait_thrd; /* BCB waiting queue */
#endif /* SERVER_MODE */
#if defined(SERVER_MODE)
THREAD_ENTRY *latch_last_thread; /* last thread that acquired latch */
#endif /* SERVER_MODE */
PGBUF_BCB *hash_next; /* next hash chain */
PGBUF_BCB *prev_BCB; /* prev LRU chain */
PGBUF_BCB *next_BCB; /* next LRU or Invalid(Free) chain */
int tick_lru_list; /* age of the lru list when this BCB was inserted into it. used to decide when the bcb has
 * aged enough to be boosted to top. */
int tick_lru3; /* position in lru zone 3. small numbers are at the bottom. used to update LRU victim
* hint. */
volatile int count_fix_and_avoid_dealloc; /* two-purpose field:
* 1. count fixes up to a threshold (to detect hot pages).
* 2. avoid deallocation count.
* we don't use two separate shorts because avoid deallocation needs to
* be changed atomically... 2-byte sized atomic operations are not
* common. */
int hit_age; /* age of last hit (used to compute activities and quotas) */
LOG_LSA oldest_unflush_lsa; /* The oldest LSA record of the page that has not been written to disk */
PGBUF_IOPAGE_BUFFER *iopage_buffer; /* pointer to iopage buffer structure */
};
/* iopage buffer structure */
struct pgbuf_iopage_buffer
{
PGBUF_BCB *bcb; /* pointer to BCB structure */
#if (__WORDSIZE == 32)
int dummy; /* for 8byte align of iopage */
#elif !defined(LINUX) && !defined(WINDOWS) && !defined(AIX)
#error "you must check that iopage is aligned by 8byte !!"
#endif
FILEIO_PAGE iopage; /* The actual buffered io page */
};
/* buffer lock record (or entry) structure
*
* buffer lock table is the array of buffer lock records
* # of buffer lock records is fixed as the total # of threads.
*/
struct pgbuf_buffer_lock
{
VPID vpid; /* buffer-locked page id */
PGBUF_BUFFER_LOCK *lock_next; /* next buffer lock record */
#if defined(SERVER_MODE)
THREAD_ENTRY *next_wait_thrd; /* buffer-lock waiting queue */
#endif /* SERVER_MODE */
};
/* buffer hash entry structure
*
* buffer hash table is the array of buffer hash entries.
*/
struct pgbuf_buffer_hash
{
#if defined(SERVER_MODE)
pthread_mutex_t hash_mutex; /* hash mutex for the integrity of buffer hash chain and buffer lock chain. */
#endif /* SERVER_MODE */
PGBUF_BCB *hash_next; /* the anchor of buffer hash chain */
PGBUF_BUFFER_LOCK *lock_next; /* the anchor of buffer lock chain */
};
/* buffer LRU list structure : double linked list */
struct pgbuf_lru_list
{
#if defined(SERVER_MODE)
pthread_mutex_t mutex; /* LRU mutex for the integrity of LRU list. */
#endif /* SERVER_MODE */
PGBUF_BCB *top; /* top of the LRU list */
PGBUF_BCB *bottom; /* bottom of the LRU list */
PGBUF_BCB *bottom_1; /* the last of LRU_1_Zone. NULL if lru1 zone is empty */
PGBUF_BCB *bottom_2; /* the last of LRU_2_Zone. NULL if lru2 zone is empty */
PGBUF_BCB *volatile victim_hint; /* hint to start searching for victims in lru list. everything below the hint
* should be dirty, but the hint is not always the first bcb that can be
* victimized. */
/* TODO: I have noticed while investigating core files from TPCC that hint is
* sometimes before first bcb that can be victimized. this means there is
* a logic error somewhere. I don't know where, but there must be. */
/* zone counters */
int count_lru1;
int count_lru2;
int count_lru3;
/* victim candidate counter */
int count_vict_cand;
/* zone thresholds. we only need for zones one and two */
int threshold_lru1;
int threshold_lru2;
/* quota (private lists only) */
int quota;
/* list tick. incremented when new bcb's are added to the list or when bcb's are boosted to top */
int tick_list; /* tick incremented whenever bcb is added or moved in list */
int tick_lru3; /* tick incremented whenever bcb's fall to zone three */
volatile int flags; /* LRU list flags */
int index; /* LRU list index */
};
/* buffer invalid BCB list : single linked list */
struct pgbuf_invalid_list
{
#if defined(SERVER_MODE)
pthread_mutex_t invalid_mutex; /* invalid mutex for the integrity of invalid BCB list. */
#endif /* SERVER_MODE */
PGBUF_BCB *invalid_top; /* top of the invalid BCB list */
int invalid_cnt; /* # of entries in invalid BCB list */
};
/* The page replacement algorithm is LRU + Aout of 2Q. This algorithm uses two linked lists as follows:
* - LRU list: this is a list of BCBs managed as a Least Recently Used queue
 * - Aout list: this is a list of VPIDs managed as a FIFO queue
* The LRU list manages the "hot" pages, Aout list holds a short term history of pages which have been victimized.
*/
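/* a simplified sketch of the classic 2Q decision applied on page fix (the exact placement in this module also depends
 * on zones and quota):
 *
 *   if the page is already in a LRU list:   keep it there (possibly boost it if it aged enough)
 *   else if its VPID is found in Aout:      it was evicted recently; insert with higher priority
 *   else:                                   cold read; insert with default priority
 */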
/* Aout list node */
struct pgbuf_aout_buf
{
VPID vpid; /* page VPID */
int lru_idx;
PGBUF_AOUT_BUF *next; /* next element in list */
PGBUF_AOUT_BUF *prev; /* prev element in list */
};
/* Aout list */
struct pgbuf_aout_list
{
#if defined(SERVER_MODE)
pthread_mutex_t Aout_mutex; /* Aout mutex for the integrity of Aout list. */
#endif /* SERVER_MODE */
PGBUF_AOUT_BUF *Aout_top; /* top of the queue */
PGBUF_AOUT_BUF *Aout_bottom; /* bottom of the queue */
PGBUF_AOUT_BUF *Aout_free; /* a free list of Aout nodes */
PGBUF_AOUT_BUF *bufarray; /* Array holding all the nodes in the list. Since Aout has a predefined fixed size, it
* makes more sense to preallocate all the nodes */
int num_hashes; /* number of hash tables */
MHT_TABLE **aout_buf_ht; /* hash table for fast history lookup. */
int max_count; /* maximum size of the Aout queue */
};
/* Generic structure to manage sequential flush with flush rate control:
 * Flush rate control is achieved by breaking each 1 second into intervals, and attempting to flush an equal number of
* pages in each interval.
* Compensation is applied across all intervals in one second to achieve overall flush rate.
* In each interval, the pages are flushed either in burst mode or equally time spread during the entire interval.
*/
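/* worked example of the rate control above (illustrative numbers): with a flush rate of 200 pages/sec and
 * interval_msec = 100, each of the ten intervals targets 20 pages; if one interval manages only 15, the 5-page
 * deficit is compensated across the remaining intervals of the same second. */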
struct pgbuf_seq_flusher
{
PGBUF_VICTIM_CANDIDATE_LIST *flush_list; /* flush list */
LOG_LSA flush_upto_lsa; /* newest of the oldest LSA record of the pages which will be written to disk */
int control_intervals_cnt; /* intervals passed */
int control_flushed; /* number of pages flushed since the 1 second super-interval started */
int interval_msec; /* duration of one interval */
int flush_max_size; /* max size of elements, set only on init */
int flush_cnt; /* current count of elements in flush_list */
int flush_idx; /* index of current element to flush */
int flushed_pages; /* cnt of flushed pages (return parameter) */
float flush_rate; /* maximum rate of flushing (negative if none should be used) */
bool burst_mode; /* config : flush in burst or flush one page and wait */
};
typedef struct pgbuf_page_monitor PGBUF_PAGE_MONITOR;
struct pgbuf_page_monitor
{
INT64 dirties_cnt; /* Number of dirty buffers. */
int *lru_hits; /* Current hits in LRU1 per LRU */
int *lru_activity; /* Activity level per LRU */
/* Overall counters */
volatile int lru_shared_pgs_cnt; /* count of BCBs in all shared LRUs */
std::atomic_int pg_unfix_cnt; /* Count of page unfixes; used for refreshing quota adjustment */
int lru_victim_req_cnt; /* number of victim requests from all LRUs */
std::atomic_int fix_req_cnt; /* number of fix requests */
#if defined (SERVER_MODE)
PGBUF_MONITOR_BCB_MUTEX *bcb_locks; /* track bcb mutex usage. */
#endif /* SERVER_MODE */
bool victim_rich; /* true if page buffer pool has many victims. pgbuf_adjust_quotas will update this
* value. */
};
typedef struct pgbuf_page_quota PGBUF_PAGE_QUOTA;
struct pgbuf_page_quota
{
int num_private_LRU_list; /* number of private LRU lists */
/* Real-time tuning: */
float *lru_victim_flush_priority_per_lru; /* priority to flush from this LRU */
int *private_lru_session_cnt; /* Number of active sessions for each private LRU: contains only private lists! */
float private_pages_ratio; /* Ratio of all private BCBs among total BCBs */
/* TODO: remove me --> */
unsigned int add_shared_lru_idx; /* circular index of shared LRU for relocating to shared */
int avoid_shared_lru_idx; /* index of shared LRU to avoid when relocating to shared;
* this is usually the index of shared LRU with maximum number of BCBs;
* transaction will avoid this list when relocating to shared LRU (like when moving from
* a garbage LRU); such LRU list returns to normal size through victimization */
TSC_TICKS last_adjust_time;
INT32 adjust_age;
int is_adjusting;
};
#if defined (SERVER_MODE)
/* PGBUF_DIRECT_VICTIM - system used to optimize victim assignment without searching and burning CPU uselessly.
 * threads wait to be assigned a victim directly and are then woken up.
 */
typedef struct pgbuf_direct_victim PGBUF_DIRECT_VICTIM;
struct pgbuf_direct_victim
{
PGBUF_BCB **bcb_victims;
/* *INDENT-OFF* */
lockfree::circular_queue<THREAD_ENTRY *> *waiter_threads_high_priority;
lockfree::circular_queue<THREAD_ENTRY *> *waiter_threads_low_priority;
/* *INDENT-ON* */
};
#define PGBUF_FLUSHED_BCBS_BUFFER_SIZE (8 * 1024) /* 8k */
#endif /* SERVER_MODE */
/* The buffer Pool */
struct pgbuf_buffer_pool
{
/* total # of buffer frames in the buffer pool (see PGBUF_MINIMUM_BUFFERS for the lower bound) */
int num_buffers;
/* buffer related tables and lists (the essential structures) */
PGBUF_BCB *BCB_table; /* BCB table */
PGBUF_BUFFER_HASH *buf_hash_table; /* buffer hash table */
PGBUF_BUFFER_LOCK *buf_lock_table; /* buffer lock table */
PGBUF_IOPAGE_BUFFER *iopage_table; /* IO page table */
int num_LRU_list; /* number of shared LRU lists */
float ratio_lru1; /* ratio for lru 1 zone */
float ratio_lru2; /* ratio for lru 2 zone */
PGBUF_LRU_LIST *buf_LRU_list; /* LRU lists. When Page quota is enabled, first 'num_LRU_list' store shared pages;
* the next 'num_garbage_LRU_list' lists store shared garbage pages;
* the last 'num_private_LRU_list' are private lists.
* When page quota is disabled only shared lists are used */
PGBUF_AOUT_LIST buf_AOUT_list; /* Aout list */
PGBUF_INVALID_LIST buf_invalid_list; /* buffer invalid BCB list */
PGBUF_VICTIM_CANDIDATE_LIST *victim_cand_list;
PGBUF_SEQ_FLUSHER seq_chkpt_flusher;
PGBUF_PAGE_MONITOR monitor;
PGBUF_PAGE_QUOTA quota;
/*
 * the structures for maintaining information on BCB holders.
 * 'thrd_holder_info' has as many entries as the # of threads and
 * each entry maintains the free BCB holder list and the used BCB holder list
 * of the corresponding thread.
 * 'thrd_reserved_holder' has memory space for all BCB holder entries.
 */
PGBUF_HOLDER_ANCHOR *thrd_holder_info;
PGBUF_HOLDER *thrd_reserved_holder;
/*
 * free BCB holder list shared by all the threads.
 * When a thread needs more free BCB holder entries,
 * the thread allocates them one by one from this list.
 * However, the thread never returns the entries to this list.
 * The structure is a list of arrays of BCB holder entries.
 * 'free_holder_set' points to the first array that has free entries
 * and 'free_index' indicates the first free entry in the array.
 */
#if defined(SERVER_MODE)
pthread_mutex_t free_holder_set_mutex;
#endif /* SERVER_MODE */
PGBUF_HOLDER_SET *free_holder_set;
int free_index;
/* 'check_for_interrupts' is set true when interrupts must be checked. The log manager sets and clears this value while
 * holding TR_TABLE_CS. */
bool check_for_interrupts;
#if defined(SERVER_MODE)
bool is_flushing_victims; /* flag set true when pgbuf flush thread is flushing victim candidates */
bool is_checkpoint; /* flag set true when checkpoint is running */
#endif /* SERVER_MODE */
/* *INDENT-OFF* */
#if defined (SERVER_MODE)
PGBUF_DIRECT_VICTIM direct_victims; /* direct victim assignment */
lockfree::circular_queue<PGBUF_BCB *> *flushed_bcbs; /* post-flush processing */
#endif /* SERVER_MODE */
lockfree::circular_queue<int> *private_lrus_with_victims;
lockfree::circular_queue<int> *big_private_lrus_with_victims;
lockfree::circular_queue<int> *shared_lrus_with_victims;
/* *INDENT-ON* */
PGBUF_STATUS *show_status;
PGBUF_STATUS_OLD show_status_old;
PGBUF_STATUS_SNAPSHOT show_status_snapshot;
#if defined (SERVER_MODE)
pthread_mutex_t show_status_mutex;
#endif
};
/* victim candidate list */
/* One daemon thread performs the flush task for victim candidates.
 * The daemon finds and saves victim candidates using the following list.
 * Then, based on the list, the daemon performs the actual flush task.
 */
struct pgbuf_victim_candidate_list
{
PGBUF_BCB *bufptr; /* selected BCB as victim candidate */
VPID vpid; /* page id of the page managed by the BCB */
};
static PGBUF_BUFFER_POOL pgbuf_Pool; /* The buffer Pool */
static PGBUF_BATCH_FLUSH_HELPER pgbuf_Flush_helper;
HFID *pgbuf_ordered_null_hfid = NULL;
#if defined(CUBRID_DEBUG)
/* A buffer guard to detect overruns. */
static char pgbuf_Guard[8] = { MEM_REGION_GUARD_MARK, MEM_REGION_GUARD_MARK, MEM_REGION_GUARD_MARK,
MEM_REGION_GUARD_MARK,
MEM_REGION_GUARD_MARK, MEM_REGION_GUARD_MARK, MEM_REGION_GUARD_MARK,
MEM_REGION_GUARD_MARK
};
#endif /* CUBRID_DEBUG */
#define AOUT_HASH_DIVIDE_RATIO 1000
#define AOUT_HASH_IDX(vpid, list) ((vpid)->pageid % (list)->num_hashes)
/* pgbuf_monitor_bcb_mutex - debug tool to monitor bcb mutex usage (and leaks). */
struct pgbuf_monitor_bcb_mutex
{
PGBUF_BCB *bcb;
PGBUF_BCB *bcb_second;
int line;
int line_second;
};
#if defined (SERVER_MODE)
static bool pgbuf_Monitor_locks = false;
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
#define PGBUF_BCB_LOCK(bcb) \
(pgbuf_Monitor_locks ? pgbuf_bcbmon_lock (bcb, __LINE__) : (void) pthread_mutex_lock (&(bcb)->mutex))
#define PGBUF_BCB_TRYLOCK(bcb) \
(pgbuf_Monitor_locks ? pgbuf_bcbmon_trylock (bcb, __LINE__) : pthread_mutex_trylock (&(bcb)->mutex))
#define PGBUF_BCB_UNLOCK(bcb) \
(pgbuf_Monitor_locks ? pgbuf_bcbmon_unlock (bcb) : (void) pthread_mutex_unlock (&(bcb)->mutex))
#define PGBUF_BCB_CHECK_OWN(bcb) if (pgbuf_Monitor_locks) pgbuf_bcbmon_check_own (bcb)
#define PGBUF_BCB_CHECK_MUTEX_LEAKS() if (pgbuf_Monitor_locks) pgbuf_bcbmon_check_mutex_leaks ()
#else /* !SERVER_MODE */ /* SA_MODE */
/* single-threaded does not require mutexes, nor does it need to check them */
#define PGBUF_BCB_LOCK(bcb)
#define PGBUF_BCB_TRYLOCK(bcb) (0)
#define PGBUF_BCB_UNLOCK(bcb)
#define PGBUF_BCB_CHECK_OWN(bcb) (true)
#define PGBUF_BCB_CHECK_MUTEX_LEAKS()
#endif /* SA_MODE */
/* helper to collect performance in page fix functions */
typedef struct pgbuf_fix_perf PGBUF_FIX_PERF;
struct pgbuf_fix_perf
{
bool is_perf_tracking;
TSC_TICKS start_tick;
TSC_TICKS end_tick;
TSC_TICKS start_holder_tick;
PERF_PAGE_MODE perf_page_found;
PERF_HOLDER_LATCH perf_latch_mode;
PERF_CONDITIONAL_FIX_TYPE perf_cond_type;
PERF_PAGE_TYPE perf_page_type;
TSCTIMEVAL tv_diff;
UINT64 lock_wait_time;
UINT64 holder_wait_time;
UINT64 fix_wait_time;
};
/* in FILEIO_PAGE_RESERVED */
typedef struct pgbuf_dealloc_undo_data PGBUF_DEALLOC_UNDO_DATA;
struct pgbuf_dealloc_undo_data
{
INT32 pageid; /* Page identifier */
INT16 volid; /* Volume identifier where the page reside */
unsigned char ptype; /* Page type */
unsigned char pflag;
};
/************************************************************************/
/* Page buffer LRU section */
/************************************************************************/
#define PGBUF_GET_LRU_LIST(lru_idx) (&pgbuf_Pool.buf_LRU_list[lru_idx])
#define PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE(bcb) (pgbuf_bcb_get_zone (bcb) == PGBUF_LRU_3_ZONE)
#define PGBUF_IS_BCB_IN_LRU(bcb) ((pgbuf_bcb_get_zone (bcb) & PGBUF_LRU_ZONE_MASK) != 0)
/* How old is a BCB (bcb_age) related to age of list to which it belongs */
#define PGBUF_AGE_DIFF(bcb_age,list_age) \
(((list_age) >= (bcb_age)) ? ((list_age) - (bcb_age)) : (DB_INT32_MAX - ((bcb_age) - (list_age))))
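/* worked example of the wrap-around handling: with list_age = 10 and bcb_age = DB_INT32_MAX - 5 (the list tick
 * wrapped around), the difference branch yields DB_INT32_MAX - ((DB_INT32_MAX - 5) - 10) = 15, the true distance,
 * instead of a negative value. */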
/* is the bcb old enough? zone two acts as the buffer zone here: when a bcb falls behind by more than half of it,
 * the bcb is considered old */
#define PGBUF_IS_BCB_OLD_ENOUGH(bcb, lru_list) \
(PGBUF_AGE_DIFF ((bcb)->tick_lru_list, (lru_list)->tick_list) >= ((lru_list)->count_lru2 / 2))
/* zone counts & thresholds */
#define PGBUF_LRU_ZONE_ONE_TWO_COUNT(list) ((list)->count_lru1 + (list)->count_lru2)
#define PGBUF_LRU_LIST_COUNT(list) (PGBUF_LRU_ZONE_ONE_TWO_COUNT(list) + (list)->count_lru3)
#define PGBUF_LRU_VICTIM_ZONE_COUNT(list) ((list)->count_lru3)
#define PGBUF_LRU_IS_ZONE_ONE_OVER_THRESHOLD(list) ((list)->threshold_lru1 < (list)->count_lru1)
#define PGBUF_LRU_IS_ZONE_TWO_OVER_THRESHOLD(list) ((list)->threshold_lru2 < (list)->count_lru2)
#define PGBUF_LRU_ARE_ZONES_ONE_TWO_OVER_THRESHOLD(list) \
((list)->threshold_lru1 + (list)->threshold_lru2 < PGBUF_LRU_ZONE_ONE_TWO_COUNT(list))
/* macros for retrieving info on shared and private LRUs */
/* Limits for private chains */
#define PGBUF_PRIVATE_LRU_MIN_COUNT 4
#define PGBUF_PRIVATE_LRU_MAX_HARD_QUOTA 5000
/* Lower limit for number of pages in shared LRUs: used to compute number of private lists and number of shared lists */
#define PGBUF_MIN_PAGES_IN_SHARED_LIST 1000
#define PGBUF_MIN_SHARED_LIST_ADJUST_SIZE 50
#define PGBUF_PAGE_QUOTA_IS_ENABLED (pgbuf_Pool.quota.num_private_LRU_list > 0)
/* macros for retrieving the id of a thread's private chain (to get the actual LRU index, use
 * PGBUF_LRU_INDEX_FROM_PRIVATE on this result).
 */
#if defined (SERVER_MODE)
#define PGBUF_PRIVATE_LRU_FROM_THREAD(thread_p) \
(((thread_p) != NULL) ? ((thread_p)->private_lru_index) : 0)
static bool
PGBUF_THREAD_HAS_PRIVATE_LRU (THREAD_ENTRY * thread_p)
{
return thread_p != NULL && thread_p->m_is_private_lru_enabled;
}
#else
#define PGBUF_PRIVATE_LRU_FROM_THREAD(thread_p) 0
#define PGBUF_THREAD_HAS_PRIVATE_LRU(thread_p) false
#endif
#define PGBUF_SHARED_LRU_COUNT (pgbuf_Pool.num_LRU_list)
#define PGBUF_PRIVATE_LRU_COUNT (pgbuf_Pool.quota.num_private_LRU_list)
#define PGBUF_TOTAL_LRU_COUNT (PGBUF_SHARED_LRU_COUNT + PGBUF_PRIVATE_LRU_COUNT)
#define PGBUF_PRIVATE_LIST_FROM_LRU_INDEX(i) ((i) - PGBUF_SHARED_LRU_COUNT)
#define PGBUF_LRU_INDEX_FROM_PRIVATE(private_id) (PGBUF_SHARED_LRU_COUNT + (private_id))
#define PGBUF_IS_SHARED_LRU_INDEX(lru_idx) ((lru_idx) < PGBUF_SHARED_LRU_COUNT)
#define PGBUF_IS_PRIVATE_LRU_INDEX(lru_idx) ((lru_idx) >= PGBUF_SHARED_LRU_COUNT)
#define PGBUF_LRU_LIST_IS_OVER_QUOTA(list) (PGBUF_LRU_LIST_COUNT (list) > (list)->quota)
#define PGBUF_LRU_LIST_IS_ONE_TWO_OVER_QUOTA(list) ((PGBUF_LRU_ZONE_ONE_TWO_COUNT (list) > (list)->quota))
#define PGBUF_LRU_LIST_OVER_QUOTA_COUNT(list) (PGBUF_LRU_LIST_COUNT (list) - (list)->quota)
#define PGBUF_IS_PRIVATE_LRU_OVER_QUOTA(lru_idx) \
(PGBUF_IS_PRIVATE_LRU_INDEX (lru_idx) && PGBUF_LRU_LIST_IS_OVER_QUOTA (PGBUF_GET_LRU_LIST (lru_idx)))
#define PGBUF_IS_PRIVATE_LRU_ONE_TWO_OVER_QUOTA(lru_idx) \
(PGBUF_IS_PRIVATE_LRU_INDEX (lru_idx) && PGBUF_LRU_LIST_IS_ONE_TWO_OVER_QUOTA (PGBUF_GET_LRU_LIST (lru_idx)))
#define PGBUF_OVER_QUOTA_BUFFER(quota) MAX (10, (int) ((quota) * 0.01f))
#define PGBUF_LRU_LIST_IS_OVER_QUOTA_WITH_BUFFER(list) \
(PGBUF_LRU_LIST_COUNT (list) > (list)->quota + PGBUF_OVER_QUOTA_BUFFER ((list)->quota))
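/* worked example: for a quota of 5000, PGBUF_OVER_QUOTA_BUFFER yields MAX (10, 50) = 50, so the list is considered
 * over quota with buffer only past 5050 bcb's; for small quotas the buffer bottoms out at 10. */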
#define PBGUF_BIG_PRIVATE_MIN_SIZE 100
/* LRU flags */
#define PGBUF_LRU_VICTIM_LFCQ_FLAG ((int) 0x80000000)
#if defined (NDEBUG)
/* note: release bugs can be hard to debug due to compile optimization. the crash call-stack may point to a completely
* different code than the one that caused the crash. my workaround is to save the line of code in this global
* variable pgbuf_Abort_release_line.
*
* careful about overusing this. the code may not be fully optimized when using it. */
static int pgbuf_Abort_release_line = 0;
#define PGBUF_ABORT_RELEASE() do { pgbuf_Abort_release_line = __LINE__; abort (); } while (false)
#else /* !NDEBUG */
#define PGBUF_ABORT_RELEASE() assert (false)
#endif /* !NDEBUG */
STATIC_INLINE unsigned int pgbuf_hash_func_mirror (const VPID * vpid) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_is_temporary_volume (VOLID volid) __attribute__ ((ALWAYS_INLINE));
static int pgbuf_initialize_bcb_table (void);
static int pgbuf_initialize_hash_table (void);
static int pgbuf_initialize_lock_table (void);
static int pgbuf_initialize_lru_list (void);
static int pgbuf_initialize_aout_list (void);
static int pgbuf_initialize_invalid_list (void);
static int pgbuf_initialize_page_quota_parameters (void);
static int pgbuf_initialize_page_quota (void);
static int pgbuf_initialize_page_monitor (void);
static int pgbuf_initialize_thrd_holder (void);
STATIC_INLINE PGBUF_HOLDER *pgbuf_allocate_thrd_holder_entry (THREAD_ENTRY * thread_p) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE PGBUF_HOLDER *pgbuf_find_thrd_holder (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int pgbuf_remove_thrd_holder (THREAD_ENTRY * thread_p, PGBUF_HOLDER * holder)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int pgbuf_unlatch_thrd_holder (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr,
PGBUF_HOLDER_STAT * holder_perf_stat_p) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int pgbuf_unlatch_bcb_upon_unfix (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, int holder_status)
__attribute__ ((ALWAYS_INLINE));
static void pgbuf_unlatch_void_zone_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int thread_private_lru_index);
STATIC_INLINE bool pgbuf_should_move_private_to_shared (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb,
int thread_private_lru_index) __attribute__ ((ALWAYS_INLINE));
static int pgbuf_block_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, PGBUF_LATCH_MODE request_mode,
int request_fcnt, bool as_promote);
STATIC_INLINE int pgbuf_latch_bcb_upon_fix (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, PGBUF_LATCH_MODE request_mode,
int buf_lock_acquired, PGBUF_LATCH_CONDITION condition,
bool * is_latch_wait) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE PGBUF_BCB *pgbuf_search_hash_chain (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor,
const VPID * vpid) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE PAGE_PTR pgbuf_lockfree_fix_ro (THREAD_ENTRY * thread_p, const VPID * vpid, PAGE_FETCH_MODE fetch_mode)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE PGBUF_BCB *pgbuf_search_hash_chain_no_bcb_lock (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor,
const VPID * vpid) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_lockfree_unfix_ro (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int pgbuf_insert_into_hash_chain (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor,
PGBUF_BCB * bufptr) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int pgbuf_delete_from_hash_chain (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
__attribute__ ((ALWAYS_INLINE));
static int pgbuf_lock_page (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor, const VPID * vpid);
static int pgbuf_unlock_page (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor, const VPID * vpid,
int need_hash_mutex);
static PGBUF_BCB *pgbuf_allocate_bcb (THREAD_ENTRY * thread_p, const VPID * src_vpid);
static PGBUF_BCB *pgbuf_claim_bcb_for_fix (THREAD_ENTRY * thread_p, const VPID * vpid, PAGE_FETCH_MODE fetch_mode,
PGBUF_BUFFER_HASH * hash_anchor, PGBUF_FIX_PERF * perf, bool * try_again,
bool already_locked);
static int pgbuf_victimize_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr);
static int pgbuf_bcb_safe_flush_internal (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, bool synchronous, bool * locked);
static int pgbuf_invalidate_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr);
static int pgbuf_bcb_safe_flush_force_lock (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, bool synchronous);
static int pgbuf_bcb_safe_flush_force_unlock (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, bool synchronous);
static PGBUF_BCB *pgbuf_get_bcb_from_invalid_list (THREAD_ENTRY * thread_p);
static int pgbuf_put_bcb_into_invalid_list (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr);
STATIC_INLINE int pgbuf_get_shared_lru_index_for_add (void) __attribute__ ((ALWAYS_INLINE));
static int pgbuf_get_victim_candidates_from_lru (THREAD_ENTRY * thread_p, int check_count,
float lru_sum_flush_priority, bool * assigned_directly);
static PGBUF_BCB *pgbuf_get_victim (THREAD_ENTRY * thread_p);
static PGBUF_BCB *pgbuf_get_victim_from_lru_list (THREAD_ENTRY * thread_p, const int lru_idx);
#if defined (SERVER_MODE)
static int pgbuf_panic_assign_direct_victims_from_lru (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list,
PGBUF_BCB * bcb_start);
STATIC_INLINE void pgbuf_lfcq_assign_direct_victims (THREAD_ENTRY * thread_p, int lru_idx, int *nassign_inout)
__attribute__ ((ALWAYS_INLINE));
#endif /* SERVER_MODE */
STATIC_INLINE void pgbuf_add_vpid_to_aout_list (THREAD_ENTRY * thread_p, const VPID * vpid, const int lru_idx)
__attribute__ ((ALWAYS_INLINE));
static int pgbuf_remove_vpid_from_aout_list (THREAD_ENTRY * thread_p, const VPID * vpid);
static int pgbuf_remove_private_from_aout_list (const int lru_idx);
STATIC_INLINE void pgbuf_remove_from_lru_list (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, PGBUF_LRU_LIST * lru_list)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_add_bcb_to_top (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, PGBUF_LRU_LIST * lru_list)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_add_bcb_to_middle (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, PGBUF_LRU_LIST * lru_list)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_add_bcb_to_bottom (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, PGBUF_LRU_LIST * lru_list)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_adjust_zone1 (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, bool min_one)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_adjust_zone2 (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, bool min_one)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_adjust_zones (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, bool min_one)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_fall_bcb_to_zone_3 (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, PGBUF_LRU_LIST * lru_list)
__attribute__ ((ALWAYS_INLINE));
static void pgbuf_lru_boost_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb);
STATIC_INLINE void pgbuf_lru_add_new_bcb_to_top (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int lru_idx)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_add_new_bcb_to_middle (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int lru_idx)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_add_new_bcb_to_bottom (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int lru_idx)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_remove_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
static void pgbuf_lru_move_from_private_to_shared (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb);
static void pgbuf_move_bcb_to_bottom_lru (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb);
STATIC_INLINE int pgbuf_bcb_flush_with_wal (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, bool is_page_flush_thread,
bool * is_bcb_locked) __attribute__ ((ALWAYS_INLINE));
static void pgbuf_wake_flush_waiters (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb);
STATIC_INLINE bool pgbuf_is_exist_blocked_reader_writer (PGBUF_BCB * bufptr) __attribute__ ((ALWAYS_INLINE));
static int pgbuf_flush_all_helper (THREAD_ENTRY * thread_p, VOLID volid, bool is_only_fixed, bool is_set_lsa_as_null);
#if defined(SERVER_MODE)
static int pgbuf_timed_sleep_error_handling (THREAD_ENTRY * thrd_entry, PGBUF_BCB * bufptr);
static int pgbuf_timed_sleep (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr);
STATIC_INLINE void pgbuf_wakeup_reader_writer (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
__attribute__ ((ALWAYS_INLINE));
#endif /* SERVER_MODE */
STATIC_INLINE bool pgbuf_get_check_page_validation_level (int page_validation_level) __attribute__ ((ALWAYS_INLINE));
static bool pgbuf_is_valid_page_ptr (const PAGE_PTR pgptr);
STATIC_INLINE void pgbuf_set_bcb_page_vpid (PGBUF_BCB * bufptr) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_check_bcb_page_vpid (PGBUF_BCB * bufptr, bool maybe_deallocated)
__attribute__ ((ALWAYS_INLINE));
#if defined(CUBRID_DEBUG)
static void pgbuf_scramble (FILEIO_PAGE * iopage);
static void pgbuf_dump (void);
static int pgbuf_is_consistent (const PGBUF_BCB * bufptr, int likely_bad_after_fixcnt);
#endif /* CUBRID_DEBUG */
#if !defined(NDEBUG)
static void pgbuf_add_fixed_at (PGBUF_HOLDER * holder, const char *caller_file, int caller_line, bool reset);
#endif
#if defined(SERVER_MODE)
static void pgbuf_sleep (THREAD_ENTRY * thread_p, pthread_mutex_t * mutex_p);
STATIC_INLINE int pgbuf_wakeup (THREAD_ENTRY * thread_p) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int pgbuf_wakeup_uncond (THREAD_ENTRY * thread_p) __attribute__ ((ALWAYS_INLINE));
#endif /* SERVER_MODE */
STATIC_INLINE void pgbuf_set_dirty_buffer_ptr (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
__attribute__ ((ALWAYS_INLINE));
static int pgbuf_compare_victim_list (const void *p1, const void *p2);
static void pgbuf_wakeup_page_flush_daemon (THREAD_ENTRY * thread_p);
STATIC_INLINE bool pgbuf_check_page_ptype_internal (PAGE_PTR pgptr, PAGE_TYPE ptype, bool no_error)
__attribute__ ((ALWAYS_INLINE));
#if defined (SERVER_MODE)
static bool pgbuf_is_thread_high_priority (THREAD_ENTRY * thread_p);
#endif /* SERVER_MODE */
static int pgbuf_flush_page_and_neighbors_fb (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, int *flushed_pages);
STATIC_INLINE void pgbuf_add_bufptr_to_batch (PGBUF_BCB * bufptr, int idx) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int pgbuf_flush_neighbor_safe (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, VPID * expected_vpid,
bool * flushed) __attribute__ ((ALWAYS_INLINE));
static int pgbuf_get_groupid_and_unfix (THREAD_ENTRY * thread_p, const VPID * req_vpid, PAGE_PTR * pgptr,
VPID * groupid, bool do_unfix);
#if !defined(NDEBUG)
STATIC_INLINE void pgbuf_add_watch_instance_internal (PGBUF_HOLDER * holder, PAGE_PTR pgptr, PGBUF_WATCHER * watcher,
const PGBUF_LATCH_MODE latch_mode, const bool clear_unfix_flag,
const char *caller_file, const int caller_line)
__attribute__ ((ALWAYS_INLINE));
#else
STATIC_INLINE void pgbuf_add_watch_instance_internal (PGBUF_HOLDER * holder, PAGE_PTR pgptr, PGBUF_WATCHER * watcher,
const PGBUF_LATCH_MODE latch_mode, const bool clear_unfix_flag)
__attribute__ ((ALWAYS_INLINE));
#endif
static PGBUF_HOLDER *pgbuf_get_holder (THREAD_ENTRY * thread_p, PAGE_PTR pgptr);
static void pgbuf_remove_watcher (PGBUF_HOLDER * holder, PGBUF_WATCHER * watcher_object);
static int pgbuf_flush_chkpt_seq_list (THREAD_ENTRY * thread_p, PGBUF_SEQ_FLUSHER * seq_flusher,
const LOG_LSA * prev_chkpt_redo_lsa, LOG_LSA * chkpt_smallest_lsa);
static int pgbuf_flush_seq_list (THREAD_ENTRY * thread_p, PGBUF_SEQ_FLUSHER * seq_flusher, struct timeval *limit_time,
const LOG_LSA * prev_chkpt_redo_lsa, LOG_LSA * chkpt_smallest_lsa, int *time_rem);
static int pgbuf_initialize_seq_flusher (PGBUF_SEQ_FLUSHER * seq_flusher, PGBUF_VICTIM_CANDIDATE_LIST * f_list,
const int cnt);
static const char *pgbuf_latch_mode_str (PGBUF_LATCH_MODE latch_mode);
static const char *pgbuf_zone_str (PGBUF_ZONE zone);
static const char *pgbuf_consistent_str (int consistent);
static void pgbuf_compute_lru_vict_target (float *lru_sum_flush_priority);
STATIC_INLINE bool pgbuf_is_bcb_victimizable (PGBUF_BCB * bcb, bool has_mutex_lock) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_is_bcb_fixed_by_any (PGBUF_BCB * bcb, bool has_mutex_lock) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_assign_direct_victim (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
__attribute__ ((ALWAYS_INLINE));
#if defined (SERVER_MODE)
STATIC_INLINE bool pgbuf_get_thread_waiting_for_direct_victim (REFPTR (THREAD_ENTRY, waiting_thread_out))
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE PGBUF_BCB *pgbuf_get_direct_victim (THREAD_ENTRY * thread_p) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_is_any_thread_waiting_for_direct_victim (void) __attribute__ ((ALWAYS_INLINE));
#endif /* SERVER_MODE */
STATIC_INLINE void pgbuf_lru_add_victim_candidate (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, PGBUF_BCB * bcb)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_remove_victim_candidate (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list,
PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_lru_advance_victim_hint (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list,
PGBUF_BCB * bcb_prev_hint, PGBUF_BCB * bcb_new_hint,
bool was_vict_count_updated) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE PGBUF_LRU_LIST *pgbuf_lru_list_from_bcb (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_register_hit_for_lru (PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_update_flags (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int set_flags, int clear_flags)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_change_zone (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int lru_idx, PGBUF_ZONE zone)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE PGBUF_ZONE pgbuf_bcb_get_zone (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int pgbuf_bcb_get_lru_index (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int pgbuf_bcb_get_pool_index (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_is_dirty (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_mark_is_flushing (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_is_flushing (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_is_direct_victim (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_is_invalid_direct_victim (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_is_async_flush_request (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_is_to_vacuum (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_should_be_moved_to_bottom_lru (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_avoid_victim (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_set_dirty (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_clear_dirty (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_mark_was_flushed (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_mark_was_not_flushed (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, bool mark_dirty)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_register_avoid_deallocation (PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_unregister_avoid_deallocation (PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_should_avoid_deallocation (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_check_and_reset_fix_and_avoid_dealloc (PGBUF_BCB * bcb, const char *file, int line)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void pgbuf_bcb_register_fix (PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool pgbuf_bcb_is_hot (const PGBUF_BCB * bcb) __attribute__ ((ALWAYS_INLINE));
#if defined (SERVER_MODE)
static void pgbuf_bcbmon_lock (PGBUF_BCB * bcb, int caller_line);
static int pgbuf_bcbmon_trylock (PGBUF_BCB * bcb, int caller_line);
static void pgbuf_bcbmon_unlock (PGBUF_BCB * bcb);
static void pgbuf_bcbmon_check_own (PGBUF_BCB * bcb);
static void pgbuf_bcbmon_check_mutex_leaks (void);
#endif /* SERVER_MODE */
STATIC_INLINE bool pgbuf_lfcq_add_lru_with_victims (PGBUF_LRU_LIST * lru_list) __attribute__ ((ALWAYS_INLINE));
static PGBUF_BCB *pgbuf_lfcq_get_victim_from_private_lru (THREAD_ENTRY * thread_p, bool restricted);
static PGBUF_BCB *pgbuf_lfcq_get_victim_from_shared_lru (THREAD_ENTRY * thread_p, bool multi_threaded);
STATIC_INLINE bool pgbuf_is_hit_ratio_low (void);
static void pgbuf_flags_mask_sanity_check (void);
static void pgbuf_lru_sanity_check (const PGBUF_LRU_LIST * lru);
// TODO: find a better place for this, but not log_impl.h
STATIC_INLINE int pgbuf_find_current_wait_msecs (THREAD_ENTRY * thread_p) __attribute__ ((ALWAYS_INLINE));
static bool pgbuf_is_temp_lsa (const log_lsa & lsa);
static void pgbuf_init_temp_page_lsa (FILEIO_PAGE * io_page, PGLENGTH page_size);
static void pgbuf_scan_bcb_table (THREAD_ENTRY * thread_p);
#if defined (SERVER_MODE)
// *INDENT-OFF*
static cubthread::daemon *pgbuf_Page_maintenance_daemon = NULL;
static cubthread::daemon *pgbuf_Page_flush_daemon = NULL;
static cubthread::daemon *pgbuf_Page_post_flush_daemon = NULL;
static cubthread::daemon *pgbuf_Flush_control_daemon = NULL;
// *INDENT-ON*
#endif /* SERVER_MODE */
static bool pgbuf_is_page_flush_daemon_available ();
STATIC_INLINE void set_latch (PGBUF_ATOMIC_LATCH * latch, PGBUF_LATCH_MODE latch_mode) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void add_fcnt (PGBUF_ATOMIC_LATCH * latch, int cnt) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void set_latch_and_fcnt (PGBUF_ATOMIC_LATCH * latch, PGBUF_LATCH_MODE latch_mode, int cnt)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void set_latch_and_add_fcnt (PGBUF_ATOMIC_LATCH * latch, PGBUF_LATCH_MODE latch_mode, int cnt)
__attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void set_waiter_exists (PGBUF_ATOMIC_LATCH * latch, bool waiter_exists) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE int get_fcnt (PGBUF_ATOMIC_LATCH * latch) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE bool get_waiter_exists (PGBUF_ATOMIC_LATCH * latch) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE PGBUF_LATCH_MODE get_latch (PGBUF_ATOMIC_LATCH * latch) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE PGBUF_ATOMIC_LATCH_IMPL get_impl (PGBUF_ATOMIC_LATCH * latch) __attribute__ ((ALWAYS_INLINE));
STATIC_INLINE void copy_bcb (PGBUF_BCB * dest_bcb, PGBUF_BCB * src_bcb) __attribute__ ((ALWAYS_INLINE));
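/*
 * copy_bcb () - shallow-copies all bcb fields (the pthread mutex itself is not copyable and is skipped); both copies
 *               keep pointing to the same iopage buffer and chain neighbors.
 * return : void
 * dest_bcb (out) : destination bcb
 * src_bcb (in) : source bcb
 */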
STATIC_INLINE void
copy_bcb (PGBUF_BCB * dest_bcb, PGBUF_BCB * src_bcb)
{
#if defined(SERVER_MODE)
dest_bcb->owner_mutex = src_bcb->owner_mutex;
#endif /* SERVER_MODE */
dest_bcb->vpid = src_bcb->vpid;
dest_bcb->atomic_latch.store (src_bcb->atomic_latch.load ());
dest_bcb->flags = src_bcb->flags;
#if defined(SERVER_MODE)
dest_bcb->next_wait_thrd = src_bcb->next_wait_thrd;
#endif /* SERVER_MODE */
#if defined(SERVER_MODE)
dest_bcb->latch_last_thread = src_bcb->latch_last_thread;
#endif /* SERVER_MODE */
dest_bcb->hash_next = src_bcb->hash_next;
dest_bcb->prev_BCB = src_bcb->prev_BCB;
dest_bcb->next_BCB = src_bcb->next_BCB;
dest_bcb->tick_lru_list = src_bcb->tick_lru_list;
dest_bcb->tick_lru3 = src_bcb->tick_lru3;
dest_bcb->count_fix_and_avoid_dealloc = src_bcb->count_fix_and_avoid_dealloc;
dest_bcb->hit_age = src_bcb->hit_age;
dest_bcb->oldest_unflush_lsa = src_bcb->oldest_unflush_lsa;
dest_bcb->iopage_buffer = src_bcb->iopage_buffer;
}
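/* the latch helpers below share the same lock-free read-modify-write pattern: load the 64-bit latch word, update the
 * relevant field(s) in a local copy, and retry with compare_exchange_weak until no concurrent modification slipped in
 * between the load and the exchange. in sketch form:
 *
 *   do
 *     {
 *       impl.raw = latch->load (std::memory_order_acquire);
 *       new_impl = impl;
 *       // ... modify new_impl.impl fields ...
 *     }
 *   while (!latch->compare_exchange_weak (impl.raw, new_impl.raw, ...));
 */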
STATIC_INLINE void
set_latch (PGBUF_ATOMIC_LATCH * latch, PGBUF_LATCH_MODE latch_mode)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
PGBUF_ATOMIC_LATCH_IMPL new_impl;
do
{
impl.raw = latch->load (std::memory_order_acquire);
new_impl = impl;
new_impl.impl.latch_mode = latch_mode;
}
while (!latch->compare_exchange_weak (impl.raw, new_impl.raw, std::memory_order_acq_rel, std::memory_order_acquire));
}
STATIC_INLINE void
add_fcnt (PGBUF_ATOMIC_LATCH * latch, int cnt)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
PGBUF_ATOMIC_LATCH_IMPL new_impl;
do
{
impl.raw = latch->load (std::memory_order_acquire);
new_impl = impl;
new_impl.impl.fcnt += cnt;
}
while (!latch->compare_exchange_weak (impl.raw, new_impl.raw, std::memory_order_acq_rel, std::memory_order_acquire));
}
STATIC_INLINE void
set_latch_and_fcnt (PGBUF_ATOMIC_LATCH * latch, PGBUF_LATCH_MODE latch_mode, int cnt)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
PGBUF_ATOMIC_LATCH_IMPL new_impl;
do
{
impl.raw = latch->load (std::memory_order_acquire);
new_impl = impl;
new_impl.impl.latch_mode = latch_mode;
new_impl.impl.fcnt = cnt;
}
while (!latch->compare_exchange_weak (impl.raw, new_impl.raw, std::memory_order_acq_rel, std::memory_order_acquire));
}
STATIC_INLINE void
set_latch_and_add_fcnt (PGBUF_ATOMIC_LATCH * latch, PGBUF_LATCH_MODE latch_mode, int cnt)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
PGBUF_ATOMIC_LATCH_IMPL new_impl;
do
{
impl.raw = latch->load (std::memory_order_acquire);
new_impl = impl;
new_impl.impl.latch_mode = latch_mode;
new_impl.impl.fcnt += cnt;
}
while (!latch->compare_exchange_weak (impl.raw, new_impl.raw, std::memory_order_acq_rel, std::memory_order_acquire));
}
STATIC_INLINE void
set_waiter_exists (PGBUF_ATOMIC_LATCH * latch, bool waiter_exists)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
PGBUF_ATOMIC_LATCH_IMPL new_impl;
do
{
impl.raw = latch->load (std::memory_order_acquire);
new_impl = impl;
new_impl.impl.waiter_exists = waiter_exists;
}
while (!latch->compare_exchange_weak (impl.raw, new_impl.raw, std::memory_order_acq_rel, std::memory_order_acquire));
}
STATIC_INLINE int
get_fcnt (PGBUF_ATOMIC_LATCH * latch)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
impl.raw = latch->load (std::memory_order_acquire);
return impl.impl.fcnt;
}
STATIC_INLINE bool
get_waiter_exists (PGBUF_ATOMIC_LATCH * latch)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
impl.raw = latch->load (std::memory_order_acquire);
return impl.impl.waiter_exists;
}
STATIC_INLINE PGBUF_LATCH_MODE
get_latch (PGBUF_ATOMIC_LATCH * latch)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
impl.raw = latch->load (std::memory_order_acquire);
return impl.impl.latch_mode;
}
STATIC_INLINE PGBUF_ATOMIC_LATCH_IMPL
get_impl (PGBUF_ATOMIC_LATCH * latch)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
impl.raw = latch->load (std::memory_order_acquire);
return impl;
}
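/*
* Editorial note: the accessors above all share one lock-free idiom -- snapshot the packed latch word, edit the
* bit-field view, and publish with a weak compare-and-swap, retrying on contention. A minimal sketch of the same
* idiom for a hypothetical combined update follows (the function name is illustrative, not part of the module).
*/
#if 0 /* example only */
STATIC_INLINE void
example_set_latch_and_clear_waiter (PGBUF_ATOMIC_LATCH * latch, PGBUF_LATCH_MODE latch_mode)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
PGBUF_ATOMIC_LATCH_IMPL new_impl;
do
{
impl.raw = latch->load (std::memory_order_acquire); /* snapshot the packed word */
new_impl = impl;
new_impl.impl.latch_mode = latch_mode; /* edit the bit-field view */
new_impl.impl.waiter_exists = false;
}
while (!latch->compare_exchange_weak (impl.raw, new_impl.raw, std::memory_order_acq_rel, std::memory_order_acquire)); /* retry on contention */
}
#endif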
void
pgbuf_thread_variables_init (THREAD_ENTRY * thread_p)
{
if (!thread_p)
{
return;
}
if (pgbuf_Pool.quota.num_private_LRU_list > 0 && thread_p->private_lru_index != -1)
{
thread_p->m_is_private_lru_enabled = true;
}
else
{
thread_p->m_is_private_lru_enabled = false;
}
if (!thread_p->m_holder_anchor)
{
thread_p->m_holder_anchor = &pgbuf_Pool.thrd_holder_info[thread_p->index];
}
}
/*
* pgbuf_hash_func_mirror () - Hash VPID into hash anchor
* return: hash value
* vpid(in): VPID to hash
*/
STATIC_INLINE unsigned int
pgbuf_hash_func_mirror (const VPID * vpid)
{
#define VOLID_LSB_BITS 8
int i;
unsigned int hash_val;
unsigned int volid_lsb;
unsigned int reversed_volid_lsb = 0;
unsigned int lsb_mask;
unsigned int reverse_mask;
volid_lsb = vpid->volid;
lsb_mask = 1;
reverse_mask = 1 << (HASH_SIZE_BITS - 1);
for (i = VOLID_LSB_BITS; i > 0; i--)
{
if (volid_lsb & lsb_mask)
{
reversed_volid_lsb |= reverse_mask;
}
reverse_mask = reverse_mask >> 1;
lsb_mask = lsb_mask << 1;
}
hash_val = vpid->pageid ^ reversed_volid_lsb;
hash_val = hash_val & ((1 << HASH_SIZE_BITS) - 1);
return hash_val;
#undef VOLID_LSB_BITS
}
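/*
* Worked example (editorial): assuming HASH_SIZE_BITS == 16 purely for illustration, volid 0x03 has low bits
* ...00000011; mirroring them into the top of the hash gives 0xC000, so a page with pageid 0x0005 on that volume
* hashes to 0x0005 ^ 0xC000 = 0xC005. Consecutive pages of the same volume thus land in distinct buckets, while
* different volumes are spread across distant regions of the hash anchor array.
*/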
/*
* pgbuf_hash_vpid () - Hash a volume_page identifier
* return: hash value
* key_vpid(in): VPID to hash
* htsize(in): Size of hash table
*/
unsigned int
pgbuf_hash_vpid (const void *key_vpid, unsigned int htsize)
{
const VPID *vpid = (const VPID *) key_vpid;
return ((vpid->pageid | ((unsigned int) vpid->volid) << 24) % htsize);
}
/*
* pgbuf_compare_vpid () - Compare two vpids keys for hashing
* return: int (key_vpid1 == key_vpid2 ?)
* key_vpid1(in): First key
* key_vpid2(in): Second key
*/
int
pgbuf_compare_vpid (const void *key_vpid1, const void *key_vpid2)
{
const VPID *vpid1 = (const VPID *) key_vpid1;
const VPID *vpid2 = (const VPID *) key_vpid2;
if (vpid1->volid == vpid2->volid)
{
return vpid1->pageid - vpid2->pageid;
}
else
{
return vpid1->volid - vpid2->volid;
}
}
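/*
* Editorial sketch: pgbuf_hash_vpid and pgbuf_compare_vpid match the callback signatures expected by the generic
* hash table in memory_hash.h, as used for the AOUT list (see pgbuf_initialize_aout_list below). The table name
* and size here are illustrative only, and the sketch assumes mht_create accepts this (hash, compare) pair.
*/
#if 0 /* example only */
static MHT_TABLE *
example_create_vpid_table (void)
{
/* hash on VPID, compare by (volid, pageid) */
return mht_create ("Example: VPID hash table", 1000, pgbuf_hash_vpid, pgbuf_compare_vpid);
}
#endif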
/*
* pgbuf_initialize () - Initialize the page buffer pool
* return: NO_ERROR, or ER_code
*
* Note: Function invalidates any resident page, creates a hash table for easy
* lookup of pages in the page buffer pool, and resets the clock tick for
* the page replacement algorithm.
*/
int
pgbuf_initialize (void)
{
pgbuf_flags_mask_sanity_check ();
/* Initialize all members individually */
pgbuf_Pool.num_buffers = 0;
pgbuf_Pool.BCB_table = NULL;
pgbuf_Pool.buf_hash_table = NULL;
pgbuf_Pool.buf_lock_table = NULL;
pgbuf_Pool.iopage_table = NULL;
pgbuf_Pool.num_LRU_list = 0;
pgbuf_Pool.ratio_lru1 = 0.0f;
pgbuf_Pool.ratio_lru2 = 0.0f;
pgbuf_Pool.buf_LRU_list = NULL;
pgbuf_Pool.victim_cand_list = NULL;
memset (&pgbuf_Pool.buf_AOUT_list, 0, sizeof (PGBUF_AOUT_LIST));
memset (&pgbuf_Pool.buf_invalid_list, 0, sizeof (PGBUF_INVALID_LIST));
memset (&pgbuf_Pool.seq_chkpt_flusher, 0, sizeof (PGBUF_SEQ_FLUSHER));
memset (&pgbuf_Pool.quota, 0, sizeof (PGBUF_PAGE_QUOTA));
memset (&pgbuf_Pool.show_status_old, 0, sizeof (PGBUF_STATUS_OLD));
memset (&pgbuf_Pool.show_status_snapshot, 0, sizeof (PGBUF_STATUS_SNAPSHOT));
#if defined (SERVER_MODE)
memset (&pgbuf_Pool.direct_victims, 0, sizeof (PGBUF_VICTIM_CANDIDATE_LIST));
#endif
/* Initialize monitor - C++ members need special handling */
pgbuf_Pool.monitor.dirties_cnt = 0;
pgbuf_Pool.monitor.lru_hits = NULL;
pgbuf_Pool.monitor.lru_activity = NULL;
pgbuf_Pool.monitor.lru_shared_pgs_cnt = 0;
pgbuf_Pool.monitor.pg_unfix_cnt.store (0);
pgbuf_Pool.monitor.lru_victim_req_cnt = 0;
pgbuf_Pool.monitor.fix_req_cnt.store (0);
#if defined (SERVER_MODE)
pgbuf_Pool.monitor.bcb_locks = NULL;
#endif
pgbuf_Pool.monitor.victim_rich = false;
pgbuf_Pool.thrd_holder_info = NULL;
pgbuf_Pool.thrd_reserved_holder = NULL;
#if defined(SERVER_MODE)
pgbuf_Pool.free_holder_set_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif
pgbuf_Pool.free_holder_set = NULL;
pgbuf_Pool.free_index = 0;
pgbuf_Pool.check_for_interrupts = false;
#if defined(SERVER_MODE)
pgbuf_Pool.is_flushing_victims = false;
pgbuf_Pool.is_checkpoint = false;
pgbuf_Pool.flushed_bcbs = NULL;
#endif
pgbuf_Pool.private_lrus_with_victims = NULL;
pgbuf_Pool.big_private_lrus_with_victims = NULL;
pgbuf_Pool.shared_lrus_with_victims = NULL;
pgbuf_Pool.show_status = NULL;
#if defined (SERVER_MODE)
pgbuf_Pool.show_status_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif
pgbuf_Pool.num_buffers = prm_get_integer_value (PRM_ID_PB_NBUFFERS);
if (pgbuf_Pool.num_buffers < PGBUF_MINIMUM_BUFFERS)
{
#if defined(CUBRID_DEBUG)
er_log_debug (ARG_FILE_LINE, "pgbuf_initialize: WARNING Num_buffers = %d is too small. %d was assumed",
pgbuf_Pool.num_buffers, PGBUF_MINIMUM_BUFFERS);
#endif /* CUBRID_DEBUG */
pgbuf_Pool.num_buffers = PGBUF_MINIMUM_BUFFERS;
}
pgbuf_latch_timeout = prm_get_integer_value (PRM_ID_PAGE_LATCH_TIMEOUT) * 1000;
#if defined (SERVER_MODE)
#if defined (NDEBUG)
pgbuf_Monitor_locks = prm_get_bool_value (PRM_ID_PB_MONITOR_LOCKS);
#else /* !NDEBUG */
pgbuf_Monitor_locks = true;
#endif /* !NDEBUG */
#endif /* SERVER_MODE */
/* set ratios for lru zones */
pgbuf_Pool.ratio_lru1 = prm_get_float_value (PRM_ID_PB_LRU_HOT_RATIO);
pgbuf_Pool.ratio_lru2 = prm_get_float_value (PRM_ID_PB_LRU_BUFFER_RATIO);
pgbuf_Pool.ratio_lru1 = MAX (pgbuf_Pool.ratio_lru1, PGBUF_LRU_ZONE_MIN_RATIO);
pgbuf_Pool.ratio_lru1 = MIN (pgbuf_Pool.ratio_lru1, PGBUF_LRU_ZONE_MAX_RATIO);
pgbuf_Pool.ratio_lru2 = MAX (pgbuf_Pool.ratio_lru2, PGBUF_LRU_ZONE_MIN_RATIO);
pgbuf_Pool.ratio_lru2 = MIN (pgbuf_Pool.ratio_lru2, 1.0f - PGBUF_LRU_ZONE_MIN_RATIO - pgbuf_Pool.ratio_lru1);
assert (pgbuf_Pool.ratio_lru2 >= PGBUF_LRU_ZONE_MIN_RATIO && pgbuf_Pool.ratio_lru2 <= PGBUF_LRU_ZONE_MAX_RATIO);
assert ((pgbuf_Pool.ratio_lru1 + pgbuf_Pool.ratio_lru2) >= 0.099f
&& (pgbuf_Pool.ratio_lru1 + pgbuf_Pool.ratio_lru2) <= 0.951f);
/* keep page quota parameter initializer first */
if (pgbuf_initialize_page_quota_parameters () != NO_ERROR)
{
goto error;
}
if (pgbuf_initialize_bcb_table () != NO_ERROR)
{
goto error;
}
if (pgbuf_initialize_hash_table () != NO_ERROR)
{
goto error;
}
if (pgbuf_initialize_lock_table () != NO_ERROR)
{
goto error;
}
if (pgbuf_initialize_lru_list () != NO_ERROR)
{
goto error;
}
if (pgbuf_initialize_invalid_list () != NO_ERROR)
{
goto error;
}
if (pgbuf_initialize_aout_list () != NO_ERROR)
{
goto error;
}
if (pgbuf_initialize_thrd_holder () != NO_ERROR)
{
goto error;
}
/* keep page quota initializer first */
if (pgbuf_initialize_page_quota () != NO_ERROR)
{
goto error;
}
if (pgbuf_initialize_page_monitor () != NO_ERROR)
{
goto error;
}
pgbuf_Pool.check_for_interrupts = false;
pgbuf_Pool.victim_cand_list =
((PGBUF_VICTIM_CANDIDATE_LIST *) malloc (pgbuf_Pool.num_buffers * sizeof (PGBUF_VICTIM_CANDIDATE_LIST)));
if (pgbuf_Pool.victim_cand_list == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1,
(pgbuf_Pool.num_buffers * sizeof (PGBUF_VICTIM_CANDIDATE_LIST)));
goto error;
}
#if defined (SERVER_MODE)
pgbuf_Pool.is_flushing_victims = false;
pgbuf_Pool.is_checkpoint = false;
#endif
{
int cnt;
cnt = (int) (0.25f * pgbuf_Pool.num_buffers);
cnt = MIN (cnt, 65536);
if (pgbuf_initialize_seq_flusher (&(pgbuf_Pool.seq_chkpt_flusher), NULL, cnt) != NO_ERROR)
{
goto error;
}
}
/* TODO[arnia] : not required, if done in monitor initialization */
pgbuf_Pool.monitor.dirties_cnt = 0;
#if defined (SERVER_MODE)
pgbuf_Pool.direct_victims.bcb_victims = (PGBUF_BCB **) malloc (thread_num_total_threads () * sizeof (PGBUF_BCB *));
if (pgbuf_Pool.direct_victims.bcb_victims == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1,
thread_num_total_threads () * sizeof (PGBUF_BCB *));
goto error;
}
memset (pgbuf_Pool.direct_victims.bcb_victims, 0, thread_num_total_threads () * sizeof (PGBUF_BCB *));
/* *INDENT-OFF* */
pgbuf_Pool.direct_victims.waiter_threads_high_priority =
new lockfree::circular_queue<THREAD_ENTRY *> (thread_num_total_threads ());
/* *INDENT-ON* */
if (pgbuf_Pool.direct_victims.waiter_threads_high_priority == NULL)
{
ASSERT_ERROR ();
goto error;
}
/* *INDENT-OFF* */
pgbuf_Pool.direct_victims.waiter_threads_low_priority =
new lockfree::circular_queue<THREAD_ENTRY *> (2 * thread_num_total_threads ());
/* *INDENT-ON* */
if (pgbuf_Pool.direct_victims.waiter_threads_low_priority == NULL)
{
ASSERT_ERROR ();
goto error;
}
/* *INDENT-OFF* */
pgbuf_Pool.flushed_bcbs = new lockfree::circular_queue<PGBUF_BCB *> (PGBUF_FLUSHED_BCBS_BUFFER_SIZE);
/* *INDENT-ON* */
if (pgbuf_Pool.flushed_bcbs == NULL)
{
ASSERT_ERROR ();
goto error;
}
#endif /* SERVER_MODE */
if (PGBUF_PAGE_QUOTA_IS_ENABLED)
{
/* *INDENT-OFF* */
pgbuf_Pool.private_lrus_with_victims = new lockfree::circular_queue<int> (PGBUF_PRIVATE_LRU_COUNT * 2);
/* *INDENT-ON* */
if (pgbuf_Pool.private_lrus_with_victims == NULL)
{
ASSERT_ERROR ();
goto error;
}
/* *INDENT-OFF* */
pgbuf_Pool.big_private_lrus_with_victims = new lockfree::circular_queue<int> (PGBUF_PRIVATE_LRU_COUNT * 2);
/* *INDENT-ON* */
if (pgbuf_Pool.big_private_lrus_with_victims == NULL)
{
ASSERT_ERROR ();
goto error;
}
}
/* *INDENT-OFF* */
pgbuf_Pool.shared_lrus_with_victims = new lockfree::circular_queue<int> (PGBUF_SHARED_LRU_COUNT * 2);
/* *INDENT-ON* */
if (pgbuf_Pool.shared_lrus_with_victims == NULL)
{
ASSERT_ERROR ();
goto error;
}
pgbuf_Pool.show_status = (PGBUF_STATUS *) malloc (sizeof (PGBUF_STATUS) * (MAX_NTRANS + 1));
if (pgbuf_Pool.show_status == NULL)
{
ASSERT_ERROR ();
goto error;
}
memset (pgbuf_Pool.show_status, 0, sizeof (PGBUF_STATUS) * (MAX_NTRANS + 1));
pgbuf_Pool.show_status_old.print_out_time = time (NULL);
#if defined(SERVER_MODE)
pthread_mutex_init (&pgbuf_Pool.show_status_mutex, NULL);
#endif
return NO_ERROR;
error:
/* destroy mutexes and deallocate all the allocated memory */
pgbuf_finalize ();
return ER_FAILED;
}
/*
* pgbuf_finalize () - Terminate the page buffer manager
* return: void
*
* Note: Function invalidates any resident page, destroys the hash table used
* for lookup of pages in the page buffer pool.
*/
void
pgbuf_finalize (void)
{
PGBUF_BCB *bufptr;
PGBUF_HOLDER_SET *holder_set;
int i;
size_t hash_size, j;
#if defined(CUBRID_DEBUG)
pgbuf_dump_if_any_fixed ();
#endif /* CUBRID_DEBUG */
/* final task for buffer hash table */
if (pgbuf_Pool.buf_hash_table != NULL)
{
hash_size = PGBUF_HASH_SIZE;
for (j = 0; j < hash_size; j++)
{
pthread_mutex_destroy (&pgbuf_Pool.buf_hash_table[j].hash_mutex);
}
free_and_init (pgbuf_Pool.buf_hash_table);
}
/* final task for buffer lock table */
if (pgbuf_Pool.buf_lock_table != NULL)
{
free_and_init (pgbuf_Pool.buf_lock_table);
}
/* final task for BCB table */
if (pgbuf_Pool.BCB_table != NULL)
{
for (i = 0; i < pgbuf_Pool.num_buffers; i++)
{
bufptr = PGBUF_FIND_BCB_PTR (i);
pthread_mutex_destroy (&bufptr->mutex);
}
free_and_init (pgbuf_Pool.BCB_table);
pgbuf_Pool.num_buffers = 0;
}
if (pgbuf_Pool.iopage_table != NULL)
{
free_and_init (pgbuf_Pool.iopage_table);
}
/* final task for LRU list */
if (pgbuf_Pool.buf_LRU_list != NULL)
{
for (i = 0; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
pthread_mutex_destroy (&pgbuf_Pool.buf_LRU_list[i].mutex);
}
free_and_init (pgbuf_Pool.buf_LRU_list);
}
/* final task for invalid BCB list */
pthread_mutex_destroy (&pgbuf_Pool.buf_invalid_list.invalid_mutex);
/* final task for thrd_holder_info */
if (pgbuf_Pool.thrd_holder_info != NULL)
{
free_and_init (pgbuf_Pool.thrd_holder_info);
}
if (pgbuf_Pool.thrd_reserved_holder != NULL)
{
free_and_init (pgbuf_Pool.thrd_reserved_holder);
}
/* final task for free holder set */
pthread_mutex_destroy (&pgbuf_Pool.free_holder_set_mutex);
while (pgbuf_Pool.free_holder_set != NULL)
{
holder_set = pgbuf_Pool.free_holder_set;
pgbuf_Pool.free_holder_set = holder_set->next_set;
free_and_init (holder_set);
}
if (pgbuf_Pool.victim_cand_list != NULL)
{
free_and_init (pgbuf_Pool.victim_cand_list);
}
if (pgbuf_Pool.buf_AOUT_list.bufarray != NULL)
{
free_and_init (pgbuf_Pool.buf_AOUT_list.bufarray);
}
if (pgbuf_Pool.buf_AOUT_list.aout_buf_ht != NULL)
{
for (i = 0; i < pgbuf_Pool.buf_AOUT_list.num_hashes; i++)
{
mht_destroy (pgbuf_Pool.buf_AOUT_list.aout_buf_ht[i]);
}
free_and_init (pgbuf_Pool.buf_AOUT_list.aout_buf_ht);
pgbuf_Pool.buf_AOUT_list.num_hashes = 0;
}
pthread_mutex_destroy (&pgbuf_Pool.buf_AOUT_list.Aout_mutex);
pgbuf_Pool.buf_AOUT_list.aout_buf_ht = NULL;
pgbuf_Pool.buf_AOUT_list.Aout_bottom = NULL;
pgbuf_Pool.buf_AOUT_list.Aout_top = NULL;
pgbuf_Pool.buf_AOUT_list.Aout_free = NULL;
pgbuf_Pool.buf_AOUT_list.max_count = 0;
if (pgbuf_Pool.seq_chkpt_flusher.flush_list != NULL)
{
free_and_init (pgbuf_Pool.seq_chkpt_flusher.flush_list);
}
/* Free quota structure data */
if (pgbuf_Pool.quota.lru_victim_flush_priority_per_lru != NULL)
{
free_and_init (pgbuf_Pool.quota.lru_victim_flush_priority_per_lru);
}
if (pgbuf_Pool.quota.private_lru_session_cnt != NULL)
{
free_and_init (pgbuf_Pool.quota.private_lru_session_cnt);
}
/* Free monitor structure data */
if (pgbuf_Pool.monitor.lru_hits != NULL)
{
free_and_init (pgbuf_Pool.monitor.lru_hits);
}
if (pgbuf_Pool.monitor.lru_activity != NULL)
{
free_and_init (pgbuf_Pool.monitor.lru_activity);
}
#if defined (SERVER_MODE)
if (pgbuf_Pool.monitor.bcb_locks != NULL)
{
free_and_init (pgbuf_Pool.monitor.bcb_locks);
}
if (pgbuf_Pool.direct_victims.bcb_victims != NULL)
{
free_and_init (pgbuf_Pool.direct_victims.bcb_victims);
}
if (pgbuf_Pool.direct_victims.waiter_threads_high_priority != NULL)
{
delete pgbuf_Pool.direct_victims.waiter_threads_high_priority;
pgbuf_Pool.direct_victims.waiter_threads_high_priority = NULL;
}
if (pgbuf_Pool.direct_victims.waiter_threads_low_priority != NULL)
{
delete pgbuf_Pool.direct_victims.waiter_threads_low_priority;
pgbuf_Pool.direct_victims.waiter_threads_low_priority = NULL;
}
if (pgbuf_Pool.flushed_bcbs != NULL)
{
delete pgbuf_Pool.flushed_bcbs;
pgbuf_Pool.flushed_bcbs = NULL;
}
#endif /* SERVER_MODE */
if (pgbuf_Pool.private_lrus_with_victims != NULL)
{
delete pgbuf_Pool.private_lrus_with_victims;
pgbuf_Pool.private_lrus_with_victims = NULL;
}
if (pgbuf_Pool.big_private_lrus_with_victims != NULL)
{
delete pgbuf_Pool.big_private_lrus_with_victims;
pgbuf_Pool.big_private_lrus_with_victims = NULL;
}
if (pgbuf_Pool.shared_lrus_with_victims != NULL)
{
delete pgbuf_Pool.shared_lrus_with_victims;
pgbuf_Pool.shared_lrus_with_victims = NULL;
}
if (pgbuf_Pool.show_status != NULL)
{
free (pgbuf_Pool.show_status);
pgbuf_Pool.show_status = NULL;
}
#if defined(SERVER_MODE)
pthread_mutex_destroy (&pgbuf_Pool.show_status_mutex);
#endif
thread_clear_all_holder_anchor ();
}
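/*
* Editorial sketch of the expected lifecycle: pgbuf_initialize at boot and pgbuf_finalize at shutdown. On an
* initialization failure, pgbuf_finalize has already been invoked by the error path in pgbuf_initialize, so the
* caller only propagates the error. The wrapper function is illustrative, not part of the module.
*/
#if 0 /* example only */
static int
example_boot_page_buffer (void)
{
if (pgbuf_initialize () != NO_ERROR)
{
return ER_FAILED; /* pool already torn down by pgbuf_initialize's error path */
}
/* ... server runs ... */
pgbuf_finalize ();
return NO_ERROR;
}
#endif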
/*
* pgbuf_fix_with_retry () -
* return: Pointer to the page or NULL
* vpid(in): Complete Page identifier
* fetch_mode(in): Page fetch mode
* request_mode(in): Lock request_mode
* retry(in): Retry count
*/
PAGE_PTR
pgbuf_fix_with_retry (THREAD_ENTRY * thread_p, const VPID * vpid, PAGE_FETCH_MODE fetch_mode,
PGBUF_LATCH_MODE request_mode, int retry)
{
PAGE_PTR pgptr;
int i = 0;
bool noretry = false;
while ((pgptr = pgbuf_fix (thread_p, vpid, fetch_mode, request_mode, PGBUF_UNCONDITIONAL_LATCH)) == NULL)
{
switch (er_errid ())
{
case NO_ERROR: /* interrupt */
case ER_INTERRUPTED:
break;
case ER_LK_UNILATERALLY_ABORTED: /* timeout */
case ER_LK_PAGE_TIMEOUT:
case ER_PAGE_LATCH_TIMEDOUT:
i++;
break;
default:
noretry = true;
break;
}
if (noretry || i > retry)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_PAGE_LATCH_ABORTED, 2, vpid->volid, vpid->pageid);
break;
}
}
return pgptr;
}
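/*
* Editorial usage sketch: retry a read fix a few times on latch timeouts before giving up. The fetch mode and
* retry count are illustrative only.
*/
#if 0 /* example only */
static void
example_fix_with_retry (THREAD_ENTRY * thread_p, const VPID * vpid)
{
PAGE_PTR pgptr = pgbuf_fix_with_retry (thread_p, vpid, OLD_PAGE, PGBUF_LATCH_READ, 3);
if (pgptr != NULL)
{
/* ... read the page ... */
pgbuf_unfix (thread_p, pgptr);
}
}
#endif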
/*
* pgbuf_fix () -
* return: Pointer to the page or NULL
* vpid(in): Complete Page identifier
* fetch_mode(in): Page fetch mode.
* request_mode(in): Page latch mode.
* condition(in): Page latch condition.
*/
#if !defined(NDEBUG)
PAGE_PTR
pgbuf_fix_debug (THREAD_ENTRY * thread_p, const VPID * vpid, PAGE_FETCH_MODE fetch_mode, PGBUF_LATCH_MODE request_mode,
PGBUF_LATCH_CONDITION condition, const char *caller_file, int caller_line, const char *caller_func)
#else /* NDEBUG */
PAGE_PTR
pgbuf_fix_release (THREAD_ENTRY * thread_p, const VPID * vpid, PAGE_FETCH_MODE fetch_mode,
PGBUF_LATCH_MODE request_mode, PGBUF_LATCH_CONDITION condition)
#endif /* NDEBUG */
{
PGBUF_BUFFER_HASH *hash_anchor;
PGBUF_BCB *bufptr;
PAGE_PTR pgptr;
int wait_msecs;
#if defined(ENABLE_SYSTEMTAP)
bool pgbuf_hit = false;
#endif /* ENABLE_SYSTEMTAP */
PGBUF_HOLDER *holder;
PGBUF_WATCHER *watcher;
bool buf_lock_acquired = false;
bool is_latch_wait = false;
bool retry = false;
#if !defined (NDEBUG)
bool had_holder = false;
#endif /* !NDEBUG */
PGBUF_FIX_PERF perf;
bool maybe_deallocated;
int tran_index = LOG_FIND_THREAD_TRAN_INDEX (thread_p);
PGBUF_STATUS *show_status = &pgbuf_Pool.show_status[tran_index];
perf.perf_page_found = PERF_PAGE_MODE_OLD_IN_BUFFER;
/* parameter validation */
if (request_mode != PGBUF_LATCH_READ && request_mode != PGBUF_LATCH_WRITE)
{
assert_release (false);
return NULL;
}
if (condition != PGBUF_UNCONDITIONAL_LATCH && condition != PGBUF_CONDITIONAL_LATCH)
{
assert_release (false);
return NULL;
}
pgbuf_Pool.monitor.fix_req_cnt.fetch_add (1, std::memory_order_relaxed);
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_FETCH) && fetch_mode != RECOVERY_PAGE)
{
/* Make sure that the page has been allocated (i.e., is a valid page) */
/* Suppress errors if fetch mode is OLD_PAGE_IF_IN_BUFFER. */
if (pgbuf_is_valid_page (thread_p, vpid, fetch_mode == OLD_PAGE_IF_IN_BUFFER) != DISK_VALID)
{
return NULL;
}
}
/* Do a simple check in non-debugging mode */
if (vpid->pageid < 0)
{
er_set (ER_FATAL_ERROR_SEVERITY, ARG_FILE_LINE, ER_PB_BAD_PAGEID, 2, vpid->pageid,
fileio_get_volume_label (vpid->volid, PEEK));
return NULL;
}
if (condition == PGBUF_UNCONDITIONAL_LATCH)
{
/* Check the wait_msecs of the current transaction. A wait_msecs of zero means no wait, so change the current
* request into a conditional request. */
wait_msecs = pgbuf_find_current_wait_msecs (thread_p);
if (wait_msecs == LK_ZERO_WAIT || wait_msecs == LK_FORCE_ZERO_WAIT)
{
condition = PGBUF_CONDITIONAL_LATCH;
}
}
perf.lock_wait_time = 0;
perf.is_perf_tracking = perfmon_is_perf_tracking ();
if (perf.is_perf_tracking)
{
tsc_getticks (&perf.start_tick);
}
try_again:
/* interrupt check */
if (logtb_get_check_interrupt (thread_p) == true)
{
if (logtb_is_interrupted (thread_p, true, &pgbuf_Pool.check_for_interrupts) == true)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_INTERRUPTED, 0);
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
return NULL;
}
}
/* Normal process */
/* latch_mode = PGBUF_LATCH_READ/PGBUF_LATCH_WRITE */
if (request_mode == PGBUF_LATCH_READ
&& (fetch_mode == OLD_PAGE || fetch_mode == OLD_PAGE_PREVENT_DEALLOC || fetch_mode == OLD_PAGE_MAYBE_DEALLOCATED)
&& condition == PGBUF_UNCONDITIONAL_LATCH)
{
pgptr = pgbuf_lockfree_fix_ro (thread_p, vpid, fetch_mode);
if (pgptr != NULL)
{
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
#if !defined (NDEBUG)
pgbuf_add_fixed_at (pgbuf_find_thrd_holder (thread_p, bufptr), caller_file, caller_line, !had_holder);
#endif
#if defined (ENABLE_SYSTEMTAP)
CUBRID_PGBUF_HIT ();
pgbuf_hit = true;
#endif /* ENABLE_SYSTEMTAP */
show_status->num_hit++;
goto fast_path;
}
}
hash_anchor = &pgbuf_Pool.buf_hash_table[PGBUF_HASH_VALUE (vpid)];
buf_lock_acquired = false;
bufptr = pgbuf_search_hash_chain (thread_p, hash_anchor, vpid);
if (bufptr != NULL && pgbuf_bcb_is_direct_victim (bufptr))
{
/* we need to notify the thread that is waiting for this bcb to victimize that it cannot use it. */
pgbuf_bcb_update_flags (thread_p, bufptr, PGBUF_BCB_INVALIDATE_DIRECT_VICTIM_FLAG, PGBUF_BCB_VICTIM_DIRECT_FLAG);
}
if (bufptr != NULL)
{
#if defined (ENABLE_SYSTEMTAP)
CUBRID_PGBUF_HIT ();
pgbuf_hit = true;
#endif /* ENABLE_SYSTEMTAP */
show_status->num_hit++;
if (fetch_mode == NEW_PAGE)
{
/* When fixing a page as NEW_PAGE: if oldest_unflush_lsa of the page is not NULL_LSA, the page should be dirty. */
assert (LSA_ISNULL (&bufptr->oldest_unflush_lsa) || pgbuf_bcb_is_dirty (bufptr));
/* The page may have been invalidated but remained in the buffer, and it is going to be used again as a new
* page. */
}
}
else if (fetch_mode == OLD_PAGE_IF_IN_BUFFER)
{
/* we don't need to fix the page */
pthread_mutex_unlock (&hash_anchor->hash_mutex);
return NULL;
}
else
{
bufptr = pgbuf_claim_bcb_for_fix (thread_p, vpid, fetch_mode, hash_anchor, &perf, &retry, false);
if (bufptr == NULL)
{
if (retry)
{
retry = false;
goto try_again;
}
ASSERT_ERROR ();
return NULL;
}
buf_lock_acquired = true;
#if defined(ENABLE_SYSTEMTAP)
if (fetch_mode == NEW_PAGE && pgbuf_hit == false)
{
pgbuf_hit = true;
}
if (fetch_mode != NEW_PAGE)
{
CUBRID_PGBUF_MISS ();
}
#endif /* ENABLE_SYSTEMTAP */
}
assert (!pgbuf_bcb_is_direct_victim (bufptr));
/* At this place, the caller is holding bufptr->mutex */
pgbuf_bcb_register_fix (bufptr);
/* Set Page identifier if needed */
// Redo recovery may find an immature page whose identifier still needs to be set.
pgbuf_set_bcb_page_vpid (bufptr);
maybe_deallocated = (fetch_mode == OLD_PAGE_MAYBE_DEALLOCATED);
if (pgbuf_check_bcb_page_vpid (bufptr, maybe_deallocated) != true)
{
if (buf_lock_acquired)
{
/* bufptr->mutex will be released in the following function. */
pgbuf_put_bcb_into_invalid_list (thread_p, bufptr);
/*
* Now the caller is not holding any mutex.
* The last argument of pgbuf_unlock_page () is true, which
* means hash_mutex must be held before unlocking the page.
*/
(void) pgbuf_unlock_page (thread_p, hash_anchor, vpid, true);
}
else
{
PGBUF_BCB_UNLOCK (bufptr);
}
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
return NULL;
}
if (fetch_mode == OLD_PAGE_PREVENT_DEALLOC)
{
pgbuf_bcb_register_avoid_deallocation (bufptr);
}
/* At this place, the caller is holding bufptr->mutex */
if (perf.is_perf_tracking)
{
tsc_getticks (&perf.start_holder_tick);
}
/* Latch Pass */
#if !defined (NDEBUG)
had_holder = pgbuf_find_thrd_holder (thread_p, bufptr) != NULL;
#endif /* NDEBUG */
if (pgbuf_latch_bcb_upon_fix (thread_p, bufptr, request_mode, buf_lock_acquired, condition, &is_latch_wait)
!= NO_ERROR)
{
/* bufptr->mutex has been released; an error was set in the function. */
if (buf_lock_acquired)
{
/* hold bufptr->mutex again */
PGBUF_BCB_LOCK (bufptr);
/* bufptr->mutex will be released in the following function. */
pgbuf_put_bcb_into_invalid_list (thread_p, bufptr);
/*
* Now the caller is not holding any mutex.
* The last argument of pgbuf_unlock_page () is true, which
* means hash_mutex must be held before unlocking the page.
*/
(void) pgbuf_unlock_page (thread_p, hash_anchor, vpid, true);
}
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
return NULL;
}
#if !defined (NDEBUG)
pgbuf_add_fixed_at (pgbuf_find_thrd_holder (thread_p, bufptr), caller_file, caller_line, !had_holder);
#endif /* NDEBUG */
if (perf.is_perf_tracking && is_latch_wait)
{
tsc_getticks (&perf.end_tick);
tsc_elapsed_time_usec (&perf.tv_diff, perf.end_tick, perf.start_holder_tick);
perf.holder_wait_time = perf.tv_diff.tv_sec * 1000000LL + perf.tv_diff.tv_usec;
}
assert (bufptr == bufptr->iopage_buffer->bcb);
/* In case of NO_ERROR, bufptr->mutex has been released. */
/* Dirty Pages Table Registration Pass */
/* Currently, do nothing. Whenever the fixed page becomes dirty, oldest_unflush_lsa is set. */
/* Hash Chain Connection Pass */
if (buf_lock_acquired)
{
pgbuf_insert_into_hash_chain (thread_p, hash_anchor, bufptr);
/*
* The caller is holding hash_anchor->hash_mutex.
* Therefore, the third argument of pgbuf_unlock_page () is false,
* which means the hash mutex does not need to be held.
*/
(void) pgbuf_unlock_page (thread_p, hash_anchor, vpid, false);
}
CAST_BFPTR_TO_PGPTR (pgptr, bufptr);
fast_path:
#if !defined (NDEBUG)
assert (pgptr != NULL);
holder = pgbuf_get_holder (thread_p, pgptr);
assert (holder != NULL);
watcher = holder->last_watcher;
while (watcher != NULL)
{
assert (watcher->magic == PGBUF_WATCHER_MAGIC_NUMBER);
watcher = watcher->prev;
}
#endif
if (fetch_mode == OLD_PAGE_PREVENT_DEALLOC)
{
/* latch is obtained, no need for avoidance of dealloc */
pgbuf_bcb_unregister_avoid_deallocation (bufptr);
}
#if !defined (NDEBUG)
thread_p->get_pgbuf_tracker ().increment (caller_file, caller_line, pgptr);
#endif // !NDEBUG
if (bufptr->iopage_buffer->iopage.prv.ptype == PAGE_UNKNOWN)
{
/* deallocated page */
switch (fetch_mode)
{
case NEW_PAGE:
case OLD_PAGE_DEALLOCATED:
case OLD_PAGE_IF_IN_BUFFER:
case RECOVERY_PAGE:
/* fixing deallocated page is expected. fall through to return it. */
break;
case OLD_PAGE:
case OLD_PAGE_PREVENT_DEALLOC:
default:
/* caller does not expect any deallocated pages. this is an invalid page. */
assert (false);
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_PB_BAD_PAGEID, 2, vpid->pageid,
fileio_get_volume_label (vpid->volid, PEEK));
/* fall through to unfix */
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
pgbuf_unfix (thread_p, pgptr);
return NULL;
case OLD_PAGE_MAYBE_DEALLOCATED:
/* OLD_PAGE_MAYBE_DEALLOCATED is used when a deallocated page may be fixed. The caller wants the page only if
* it has not been deallocated; if it has, no error is required. */
er_set (ER_WARNING_SEVERITY, ARG_FILE_LINE, ER_PB_BAD_PAGEID, 2, vpid->pageid,
fileio_get_volume_label (vpid->volid, PEEK));
/* fall through to unfix */
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
pgbuf_unfix (thread_p, pgptr);
return NULL;
}
/* note: maybe we could check this in an earlier stage, but it would have been a lot more complicated. the only
* interesting case here is OLD_PAGE_MAYBE_DEALLOCATED, and even that is used in cases where the vast
* majority of pages are not deallocated, so in terms of performance the loss is insignificant.
* Moreover, it is safer and easier to treat the case here, where we hold the latch that prevents concurrent
* deallocations. */
}
else
{
/* this cannot be a new page or a deallocated page.
* note: temporary pages are not strictly handled in regard with their deallocation status. */
assert (fetch_mode != NEW_PAGE || pgbuf_is_lsa_temporary (pgptr));
}
show_status->num_page_request++;
/* Record number of fetches in statistics */
if (perf.is_perf_tracking)
{
perf.perf_page_type = pgbuf_get_page_type_for_stat (thread_p, pgptr);
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_FETCHES);
if (request_mode == PGBUF_LATCH_READ)
{
perf.perf_latch_mode = PERF_HOLDER_LATCH_READ;
}
else
{
assert (request_mode == PGBUF_LATCH_WRITE);
perf.perf_latch_mode = PERF_HOLDER_LATCH_WRITE;
}
if (condition == PGBUF_UNCONDITIONAL_LATCH)
{
if (is_latch_wait)
{
perf.perf_cond_type = PERF_UNCONDITIONAL_FIX_WITH_WAIT;
if (perf.holder_wait_time > 0)
{
perfmon_pbx_hold_acquire_time (thread_p, perf.perf_page_type, perf.perf_page_found,
perf.perf_latch_mode, perf.holder_wait_time);
}
}
else
{
perf.perf_cond_type = PERF_UNCONDITIONAL_FIX_NO_WAIT;
}
}
else
{
perf.perf_cond_type = PERF_CONDITIONAL_FIX;
}
perfmon_pbx_fix (thread_p, perf.perf_page_type, perf.perf_page_found, perf.perf_latch_mode, perf.perf_cond_type);
if (perf.lock_wait_time > 0)
{
perfmon_pbx_lock_acquire_time (thread_p, perf.perf_page_type, perf.perf_page_found, perf.perf_latch_mode,
perf.perf_cond_type, perf.lock_wait_time);
}
tsc_getticks (&perf.end_tick);
tsc_elapsed_time_usec (&perf.tv_diff, perf.end_tick, perf.start_tick);
perf.fix_wait_time = perf.tv_diff.tv_sec * 1000000LL + perf.tv_diff.tv_usec;
if (perf.fix_wait_time > 0)
{
perfmon_pbx_fix_acquire_time (thread_p, perf.perf_page_type, perf.perf_page_found, perf.perf_latch_mode,
perf.perf_cond_type, perf.fix_wait_time);
perfmon_add_at_offset_to_local (thread_p, pstat_Metadata[PSTAT_PB_PAGE_FIX_ACQUIRE_TIME_10USEC].start_offset,
perf.fix_wait_time);
}
}
if (VACUUM_IS_THREAD_VACUUM_WORKER (thread_p))
{
pgbuf_bcb_update_flags (thread_p, bufptr, 0, PGBUF_BCB_TO_VACUUM_FLAG);
}
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
return pgptr;
}
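/*
* Editorial usage sketch for the fix/unfix protocol above: try a conditional latch first (deadlock avoidance),
* fall back to an unconditional request, and always pair the fix with an unfix.
*/
#if 0 /* example only */
static void
example_fix_and_unfix (THREAD_ENTRY * thread_p, const VPID * vpid)
{
PAGE_PTR pgptr = pgbuf_fix (thread_p, vpid, OLD_PAGE, PGBUF_LATCH_READ, PGBUF_CONDITIONAL_LATCH);
if (pgptr == NULL)
{
/* conditional request refused; block until the latch is granted */
pgptr = pgbuf_fix (thread_p, vpid, OLD_PAGE, PGBUF_LATCH_READ, PGBUF_UNCONDITIONAL_LATCH);
}
if (pgptr != NULL)
{
/* ... use the page under read latch ... */
pgbuf_unfix_and_init (thread_p, pgptr);
}
}
#endif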
/*
* pgbuf_simple_fix () - Fix a page without taking a latch
* return: Pointer to the page or NULL
* vpid(in): Complete Page identifier
* need_fix(in): if false and the page is not resident, do not bring it into the buffer
*
* Note:
* WARNING:
* This is only for reading temporary files.
* If the BCB is resident, only fcnt is incremented; no latch and no LRU mutex are taken.
* Even for a temporary file, a concurrent write operation can cause problems.
* Cannot be mixed with the general FIX (LATCH) protocol.
*/
PAGE_PTR
pgbuf_simple_fix (THREAD_ENTRY * thread_p, const VPID * vpid, bool need_fix)
{
PGBUF_BUFFER_HASH *hash_anchor;
PGBUF_BCB *bufptr;
PAGE_PTR pgptr;
bool retry;
int th_lru_idx;
assert (pgbuf_is_temporary_volume (vpid->volid));
retry:
/* Is this a resident page ? */
hash_anchor = &(pgbuf_Pool.buf_hash_table[PGBUF_HASH_VALUE (vpid)]);
bufptr = pgbuf_search_hash_chain (thread_p, hash_anchor, vpid);
if (bufptr == NULL)
{
if (!need_fix || er_errid () == ER_CSS_PTHREAD_MUTEX_TRYLOCK)
{
pthread_mutex_unlock (&hash_anchor->hash_mutex);
return NULL;
}
if (pgbuf_lock_page (thread_p, hash_anchor, vpid) != PGBUF_LOCK_HOLDER)
{
/* retry */
goto retry;
}
bufptr = pgbuf_claim_bcb_for_fix (thread_p, vpid, OLD_PAGE, hash_anchor, NULL, &retry, true);
if (bufptr == NULL)
{
(void) pgbuf_unlock_page (thread_p, hash_anchor, vpid, true);
return NULL;
}
pgbuf_insert_into_hash_chain (thread_p, hash_anchor, bufptr);
(void) pgbuf_unlock_page (thread_p, hash_anchor, vpid, false);
add_fcnt (&bufptr->atomic_latch, 1);
CAST_BFPTR_TO_PGPTR (pgptr, bufptr);
/* add lru list. */
if (PGBUF_THREAD_HAS_PRIVATE_LRU (thread_p))
{
th_lru_idx = PGBUF_LRU_INDEX_FROM_PRIVATE (PGBUF_PRIVATE_LRU_FROM_THREAD (thread_p));
pgbuf_lru_add_new_bcb_to_top (thread_p, bufptr, th_lru_idx);
}
else
{
pgbuf_lru_add_new_bcb_to_middle (thread_p, bufptr, pgbuf_get_shared_lru_index_for_add ());
}
PGBUF_BCB_UNLOCK (bufptr);
}
else
{
if (need_fix)
{
/* we need to notify the thread that is waiting for this bcb to victimize that it cannot use it. */
if (pgbuf_bcb_is_direct_victim (bufptr))
{
pgbuf_bcb_update_flags (thread_p, bufptr, PGBUF_BCB_INVALIDATE_DIRECT_VICTIM_FLAG,
PGBUF_BCB_VICTIM_DIRECT_FLAG);
}
}
else
{
if (pgbuf_bcb_is_direct_victim (bufptr))
{
/* This BCB will be soon victimized and removed from the hash table. so it is considered not found. */
PGBUF_BCB_UNLOCK (bufptr);
return NULL;
}
}
/* the caller is holding only bufptr->mutex. */
CAST_BFPTR_TO_PGPTR (pgptr, bufptr);
add_fcnt (&bufptr->atomic_latch, 1);
/* release mutex */
PGBUF_BCB_UNLOCK (bufptr);
}
return pgptr;
}
/*
* pgbuf_simple_unfix () - Free the buffer where the page associated with pgptr resides
* return: void
* pgptr(in): Pointer to page
*
* Note:
* WARNING:
* This is only for reading temporary files.
* Only fcnt is decremented; no latch and no LRU mutex are taken.
* Even for a temporary file, a concurrent write operation can cause problems.
*/
void
pgbuf_simple_unfix (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
/* only decrease fcnt */
PGBUF_BCB_LOCK (bufptr);
add_fcnt (&bufptr->atomic_latch, -1);
PGBUF_BCB_UNLOCK (bufptr);
}
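/*
* Editorial sketch of the pairing contract: every successful pgbuf_simple_fix must be matched by a
* pgbuf_simple_unfix, and only temporary-volume pages may be accessed this way.
*/
#if 0 /* example only */
static void
example_simple_fix_and_unfix (THREAD_ENTRY * thread_p, const VPID * temp_vpid)
{
PAGE_PTR pgptr = pgbuf_simple_fix (thread_p, temp_vpid, true);
if (pgptr != NULL)
{
/* ... read-only access to the temporary page ... */
pgbuf_simple_unfix (thread_p, pgptr);
}
}
#endif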
/*
* pgbuf_dealloc_temp_page () - invalidate a page of a temporary table
* return: NO_ERROR
* pgptr(in): Pointer to page
* need_free(in): also decrement the fix count (simple unfix)
*
* Note:
* This is only for temporary files.
* Resets ptype and clears the dirty flag.
*/
int
pgbuf_dealloc_temp_page (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, bool need_free)
{
PGBUF_BCB *bufptr;
/* invalidation task is performed */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
PGBUF_BCB_LOCK (bufptr);
/* set unknown type */
bufptr->iopage_buffer->iopage.prv.ptype = (unsigned char) PAGE_UNKNOWN;
/* clear page flags */
bufptr->iopage_buffer->iopage.prv.pflag = (unsigned char) 0;
/* clear dirty */
pgbuf_bcb_clear_dirty (thread_p, bufptr);
/* simple unfix */
if (need_free)
{
add_fcnt (&bufptr->atomic_latch, -1);
assert (get_fcnt (&bufptr->atomic_latch) == 0);
}
PGBUF_BCB_UNLOCK (bufptr);
return NO_ERROR;
}
/*
* pgbuf_promote_read_latch () - Promote read latch to write latch
* return: error code or NO_ERROR
* pgptr(in/out): page pointer
* condition(in): promotion condition (single reader holder/shared reader holder)
*/
#if !defined (NDEBUG)
int
pgbuf_promote_read_latch_debug (THREAD_ENTRY * thread_p, PAGE_PTR * pgptr_p, PGBUF_PROMOTE_CONDITION condition,
const char *caller_file, int caller_line, const char *caller_func)
#else /* NDEBUG */
int
pgbuf_promote_read_latch_release (THREAD_ENTRY * thread_p, PAGE_PTR * pgptr_p, PGBUF_PROMOTE_CONDITION condition)
#endif /* NDEBUG */
{
PGBUF_BCB *bufptr;
PGBUF_ATOMIC_LATCH_IMPL impl, impl_new;
#if defined(SERVER_MODE)
PGBUF_HOLDER *holder;
VPID vpid;
TSC_TICKS start_tick, end_tick;
TSCTIMEVAL tv_diff;
UINT64 promote_wait_time;
bool is_perf_tracking, need_block;
PERF_PAGE_TYPE perf_page_type = PERF_PAGE_UNKNOWN;
PERF_PROMOTE_CONDITION perf_promote_cond_type = PERF_PROMOTE_ONLY_READER;
PERF_HOLDER_LATCH perf_holder_latch = PERF_HOLDER_LATCH_READ;
PGBUF_HOLDER_STAT perf_stat;
int stat_success = 0;
int rv = NO_ERROR;
int fix_count;
#endif /* SERVER_MODE */
#if !defined (NDEBUG)
assert (pgptr_p != NULL);
assert (*pgptr_p != NULL);
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_FREE))
{
if (pgbuf_is_valid_page_ptr (*pgptr_p) == false)
{
return ER_FAILED;
}
}
#else /* !NDEBUG */
if (*pgptr_p == NULL)
{
return ER_FAILED;
}
#endif /* !NDEBUG */
/* fetch BCB from page pointer */
CAST_PGPTR_TO_BFPTR (bufptr, *pgptr_p);
#if defined(SERVER_MODE) /* SERVER_MODE */
vpid.pageid = bufptr->vpid.pageid;
vpid.volid = bufptr->vpid.volid;
/* performance tracking - get start counter */
is_perf_tracking = perfmon_is_perf_tracking ();
if (is_perf_tracking)
{
tsc_getticks (&start_tick);
}
PGBUF_BCB_LOCK (bufptr);
holder = pgbuf_find_thrd_holder (thread_p, bufptr);
assert_release (holder != NULL);
do
{
need_block = false;
impl = get_impl (&bufptr->atomic_latch);
impl_new = impl;
if (holder->fix_count == impl.impl.fcnt)
{
if (impl.impl.waiter_exists == true && bufptr->next_wait_thrd
&& bufptr->next_wait_thrd->wait_for_latch_promote)
{
PGBUF_BCB_UNLOCK (bufptr);
rv = ER_PAGE_LATCH_PROMOTE_FAIL;
#if !defined(NDEBUG)
er_set (ER_NOTIFICATION_SEVERITY, ARG_FILE_LINE, ER_PAGE_LATCH_PROMOTE_FAIL, 2, vpid.pageid, vpid.volid);
#endif
goto end;
}
else
{
/* we're the single holder of the read latch, do an in-place promotion */
impl_new.impl.latch_mode = PGBUF_LATCH_WRITE;
holder->perf_stat.hold_has_write_latch = 1;
/* NOTE: no need to set the promoted flag as long as we don't wait */
}
}
else
{
if ((condition == PGBUF_PROMOTE_ONLY_READER)
|| (bufptr->next_wait_thrd != NULL && bufptr->next_wait_thrd->wait_for_latch_promote))
{
/*
* CASE #1: first waiter is from a latch promotion - we can't
* guarantee both will see the same page they initially fixed so
* we'll abort the current promotion
* CASE #2: PGBUF_PROMOTE_ONLY_READER condition, we're only allowed
* to promote if we're the only reader; this is not the case
*/
PGBUF_BCB_UNLOCK (bufptr);
rv = ER_PAGE_LATCH_PROMOTE_FAIL;
#if !defined(NDEBUG)
er_set (ER_NOTIFICATION_SEVERITY, ARG_FILE_LINE, ER_PAGE_LATCH_PROMOTE_FAIL, 2, vpid.pageid, vpid.volid);
#endif
goto end;
}
else
{
fix_count = holder->fix_count;
perf_stat = holder->perf_stat;
if (impl.impl.fcnt == holder->fix_count)
{
continue;
}
impl_new.impl.fcnt -= fix_count;
impl_new.impl.waiter_exists = true;
need_block = true;
}
}
}
while (!bufptr->atomic_latch.compare_exchange_strong (impl.raw, impl_new.raw, std::memory_order_acq_rel,
std::memory_order_acquire));
if (!need_block)
{
PGBUF_BCB_UNLOCK (bufptr);
}
else
{
holder->fix_count = 0;
if (pgbuf_remove_thrd_holder (thread_p, holder) != NO_ERROR)
{
/* We unfixed the page, but failed to remove holder entry; consider the page as unfixed */
*pgptr_p = NULL;
/* shouldn't happen */
PGBUF_BCB_UNLOCK (bufptr);
assert_release (false);
return ER_FAILED;
}
holder = NULL;
/* NOTE: at this point the page is unfixed */
/* flag this thread as promoter */
thread_p->wait_for_latch_promote = true;
/* register as first blocker */
if (pgbuf_block_bcb (thread_p, bufptr, PGBUF_LATCH_WRITE, fix_count, true) != NO_ERROR)
{
*pgptr_p = NULL; /* we didn't get a new latch */
thread_p->wait_for_latch_promote = false;
return ER_FAILED;
}
/* NOTE: BCB mutex is no longer held at this point */
/* remove promote flag */
thread_p->wait_for_latch_promote = false;
/* new holder entry */
assert (pgbuf_find_thrd_holder (thread_p, bufptr) == NULL);
holder = pgbuf_allocate_thrd_holder_entry (thread_p);
if (holder == NULL)
{
/* We have a new latch, but can't add a holder entry; consider the page as fixed */
/* This situation must never occur. */
assert_release (false);
return ER_FAILED;
}
holder->fix_count = fix_count;
holder->bufptr = bufptr;
holder->perf_stat = perf_stat;
if (get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_WRITE)
{
holder->perf_stat.hold_has_write_latch = 1;
}
else if (get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_READ)
{
holder->perf_stat.hold_has_read_latch = 1;
}
#if !defined(NDEBUG)
pgbuf_add_fixed_at (holder, caller_file, caller_line, true);
#endif /* NDEBUG */
}
end:
assert (rv == NO_ERROR || rv == ER_PAGE_LATCH_PROMOTE_FAIL);
/* performance tracking */
if (is_perf_tracking)
{
/* compute time */
tsc_getticks (&end_tick);
tsc_elapsed_time_usec (&tv_diff, end_tick, start_tick);
promote_wait_time = tv_diff.tv_sec * 1000000LL + tv_diff.tv_usec;
/* determine success or fail */
if (rv == NO_ERROR)
{
stat_success = 1;
}
/* aggregate success/fail */
perfmon_pbx_promote (thread_p, perf_page_type, perf_promote_cond_type, perf_holder_latch, stat_success,
promote_wait_time);
}
/* all successful */
return rv;
#else /* SERVER_MODE */
set_latch (&bufptr->atomic_latch, PGBUF_LATCH_WRITE);
return NO_ERROR;
#endif
}
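/*
* Editorial usage sketch: a read-latch holder attempts a promotion and falls back to unfix + write-latch refix
* when the promotion fails. PGBUF_PROMOTE_SHARED_READER is assumed to be the usual promote condition; after the
* refix the caller must re-validate any state it read under the old latch.
*/
#if 0 /* example only */
static int
example_promote_to_write (THREAD_ENTRY * thread_p, PAGE_PTR * pgptr_p, const VPID * vpid)
{
int rc = pgbuf_promote_read_latch (thread_p, pgptr_p, PGBUF_PROMOTE_SHARED_READER);
if (rc == ER_PAGE_LATCH_PROMOTE_FAIL)
{
/* promotion refused; release the read latch and take a write latch */
pgbuf_unfix_and_init (thread_p, *pgptr_p);
*pgptr_p = pgbuf_fix (thread_p, vpid, OLD_PAGE, PGBUF_LATCH_WRITE, PGBUF_UNCONDITIONAL_LATCH);
rc = (*pgptr_p != NULL) ? NO_ERROR : ER_FAILED;
}
return rc;
}
#endif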
/*
* pgbuf_unfix () - Free the buffer where the page associated with pgptr resides
* return: void
* pgptr(in): Pointer to page
*
* Note: The page is subject to replacement, if not fixed by other thread of execution.
*/
#if !defined (NDEBUG)
void
pgbuf_unfix_debug (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, const char *caller_file, int caller_line,
const char *caller_func)
#else /* NDEBUG */
void
pgbuf_unfix (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
#endif /* NDEBUG */
{
PGBUF_BCB *bufptr;
int holder_status;
PERF_HOLDER_LATCH perf_holder_latch;
PGBUF_HOLDER *holder;
PGBUF_WATCHER *watcher;
PGBUF_HOLDER_STAT holder_perf_stat;
PERF_PAGE_TYPE perf_page_type = PERF_PAGE_UNKNOWN;
bool is_perf_tracking;
#if defined(CUBRID_DEBUG)
LOG_LSA restart_lsa;
#endif /* CUBRID_DEBUG */
/* Get the address of the buffer from the page and free the buffer */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (!VPID_ISNULL (&bufptr->vpid));
#if !defined (NDEBUG)
assert (pgptr != NULL);
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_FREE))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return;
}
}
holder = pgbuf_get_holder (thread_p, pgptr);
assert (holder != NULL);
watcher = holder->last_watcher;
while (watcher != NULL)
{
assert (watcher->magic == PGBUF_WATCHER_MAGIC_NUMBER);
watcher = watcher->prev;
}
#else /* !NDEBUG */
if (pgptr == NULL)
{
return;
}
#endif /* !NDEBUG */
#if defined(CUBRID_DEBUG)
/*
* If the buffer is dirty and the log sequence address of the buffer
* has not changed since the database restart, a warning is given about
* lack of logging
*/
if (pgbuf_bcb_is_dirty (bufptr) && !pgbuf_is_temp_lsa (bufptr->iopage_buffer->iopage.prv.lsa)
&& PGBUF_IS_AUXILIARY_VOLUME (bufptr->vpid.volid) == false
&& !log_is_logged_since_restart (&bufptr->iopage_buffer->iopage.prv.lsa))
{
er_log_debug (ARG_FILE_LINE,
"pgbuf_unfix: WARNING: No logging on dirty pageid = %d of Volume = %s.\n Recovery problems"
" may happen\n", bufptr->vpid.pageid, fileio_get_volume_label (bufptr->vpid.volid, PEEK));
/*
* Do not give warnings on this page any longer. Set the LSA of the
* buffer for this purpose.
*/
LSA_COPY (&restart_lsa, log_get_restart_lsa ());
pgbuf_set_lsa (thread_p, pgptr, &restart_lsa);
LSA_COPY (&bufptr->oldest_unflush_lsa, &bufptr->iopage_buffer->iopage.prv.lsa);
}
/* Check for over runs */
if (memcmp (PGBUF_FIND_BUFFER_GUARD (bufptr), pgbuf_Guard, sizeof (pgbuf_Guard)) != 0)
{
er_log_debug (ARG_FILE_LINE, "pgbuf_unfix: SYSTEM ERROR buffer of pageid = %d|%d has been OVER RUN",
bufptr->vpid.volid, bufptr->vpid.pageid);
memcpy (PGBUF_FIND_BUFFER_GUARD (bufptr), pgbuf_Guard, sizeof (pgbuf_Guard));
}
/* Give a warning if the page is not consistent */
if (get_fcnt (&bufptr->atomic_latch) <= 0)
{
er_log_debug (ARG_FILE_LINE,
"pgbuf_unfix: SYSTEM ERROR Freeing too much buffer of pageid = %d of Volume = %s\n",
bufptr->vpid.pageid, fileio_get_volume_label (bufptr->vpid.volid, PEEK));
}
#endif /* CUBRID_DEBUG */
is_perf_tracking = perfmon_is_perf_tracking ();
if (is_perf_tracking)
{
perf_page_type = pgbuf_get_page_type_for_stat (thread_p, pgptr);
}
INIT_HOLDER_STAT (&holder_perf_stat);
holder_status = pgbuf_unlatch_thrd_holder (thread_p, bufptr, &holder_perf_stat);
assert (holder_perf_stat.hold_has_write_latch == 1 || holder_perf_stat.hold_has_read_latch == 1);
if (is_perf_tracking)
{
if (holder_perf_stat.hold_has_read_latch && holder_perf_stat.hold_has_write_latch)
{
perf_holder_latch = PERF_HOLDER_LATCH_MIXED;
}
else if (holder_perf_stat.hold_has_read_latch)
{
perf_holder_latch = PERF_HOLDER_LATCH_READ;
}
else
{
assert (holder_perf_stat.hold_has_write_latch);
perf_holder_latch = PERF_HOLDER_LATCH_WRITE;
}
perfmon_pbx_unfix (thread_p, perf_page_type, holder_perf_stat.dirty_before_hold,
holder_perf_stat.dirtied_by_holder, perf_holder_latch);
}
/* if read latch exists,... */
if (pgbuf_lockfree_unfix_ro (thread_p, bufptr))
{
return;
}
PGBUF_BCB_LOCK (bufptr);
#if !defined (NDEBUG)
thread_p->get_pgbuf_tracker ().decrement (pgptr);
#endif // !NDEBUG
(void) pgbuf_unlatch_bcb_upon_unfix (thread_p, bufptr, holder_status);
/* bufptr->mutex has been released in above function. */
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
#if defined(CUBRID_DEBUG)
/*
* CONSISTENCIES AND SCRAMBLES
* You may want to tailor the following debugging block
* since its operations and their implications are very expensive.
* Too much I/O
*/
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
/*
* Check if the content of the page is consistent and then scramble
* the page to detect illegal access to the page in the future.
*/
PGBUF_BCB_LOCK (bufptr);
if (get_fcnt (&bufptr->atomic_latch) == 0)
{
/* Check for consistency */
if (!VPID_ISNULL (&bufptr->vpid) && pgbuf_is_consistent (bufptr, 0) == PGBUF_CONTENT_BAD)
{
er_log_debug (ARG_FILE_LINE, "pgbuf_unfix: WARNING Pageid = %d|%d seems inconsistent",
bufptr->vpid.volid, bufptr->vpid.pageid);
/* some problems in the consistency of the given buffer page */
pgbuf_dump ();
}
else
{
/* the given buffer page is consistent */
/* Flush the page if it is dirty */
if (pgbuf_bcb_is_dirty (bufptr))
{
/* flush the page with PGBUF_LATCH_FLUSH mode */
(void) pgbuf_bcb_safe_flush_force_unlock (thread_p, bufptr, true);
/*
* Since above function releases bufptr->mutex,
* the caller must hold bufptr->mutex again.
*/
PGBUF_BCB_LOCK (bufptr);
}
/*
* If the buffer is associated with a page (i.e., if the buffer
* is not used as a working area --malloc--), invalidate the
* page on this buffer.
* Detach the buffer area or scramble the area.
*/
if (!VPID_ISNULL (&bufptr->vpid))
{
/* invalidate the page with PGBUF_LATCH_INVALID mode */
(void) pgbuf_invalidate_bcb (thread_p, bufptr);
/*
* Since above function releases mutex after flushing,
* the caller must hold bufptr->mutex again.
*/
PGBUF_BCB_LOCK (bufptr);
}
pgbuf_scramble (&bufptr->iopage_buffer->iopage);
/*
* Note that the buffer is not declared for immediate
* replacement.
* wait for a while to see if an invalid access is found.
*/
}
}
PGBUF_BCB_UNLOCK (bufptr);
}
#endif /* CUBRID_DEBUG */
}
/*
* pgbuf_unfix_all () - Unfixes all the buffers that have been fixed by the current
* thread at the time of request termination
* return: void
*
* Note: At the time of request termination, no buffers should remain
* fixed by the thread. In the current CUBRID system, however, this
* situation has been observed; the system should eventually be
* corrected to prevent it from occurring.
*/
void
pgbuf_unfix_all (THREAD_ENTRY * thread_p)
{
int thrd_index;
PAGE_PTR pgptr;
PGBUF_HOLDER_ANCHOR *thrd_holder_info;
PGBUF_HOLDER *holder;
#if !defined(NDEBUG)
PGBUF_BCB *bufptr;
#if defined(CUBRID_DEBUG)
int consistent;
#endif /* CUBRID_DEBUG */
const char *latch_mode_str, *zone_str, *consistent_str;
#endif /* !NDEBUG */
thrd_index = thread_get_entry_index (thread_p);
thrd_holder_info = &(pgbuf_Pool.thrd_holder_info[thrd_index]);
if (thrd_holder_info->num_hold_cnt > 0)
{
/* For each BCB holder entry of thread's holder list */
holder = thrd_holder_info->thrd_hold_list;
while (holder != NULL)
{
assert (false);
CAST_BFPTR_TO_PGPTR (pgptr, holder->bufptr);
#if defined(NDEBUG)
pgbuf_unfix_and_init (thread_p, pgptr);
/* Within the execution of pgbuf_unfix(), the BCB holder entry is moved from the holder list of BCB to the
* free holder list of thread, and the BCB holder entry is removed from the holder list of the thread. */
holder = thrd_holder_info->thrd_hold_list;
#else /* NDEBUG */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (!VPID_ISNULL (&bufptr->vpid));
latch_mode_str = pgbuf_latch_mode_str (get_latch (&bufptr->atomic_latch));
zone_str = pgbuf_zone_str (pgbuf_bcb_get_zone (bufptr));
/* check if the content of current buffer page is consistent. */
#if defined(CUBRID_DEBUG)
consistent = pgbuf_is_consistent (bufptr, 0);
consistent_str = pgbuf_consistent_str (consistent);
#else /* CUBRID_DEBUG */
consistent_str = "UNKNOWN";
#endif /* CUBRID_DEBUG */
er_log_debug (ARG_FILE_LINE,
"pgbuf_unfix_all: WARNING %4d %5d %6d %4d %9s %1d %1d %1d %11s %6d|%4d %10s %p %p-%p\n",
pgbuf_bcb_get_pool_index (bufptr), bufptr->vpid.volid, bufptr->vpid.pageid,
get_fcnt (&bufptr->atomic_latch), latch_mode_str, (int) pgbuf_bcb_is_dirty (bufptr),
(int) pgbuf_bcb_is_flushing (bufptr), (int) pgbuf_bcb_is_async_flush_request (bufptr), zone_str,
LSA_AS_ARGS (&bufptr->iopage_buffer->iopage.prv.lsa), consistent_str, (void *) bufptr,
(void *) (&bufptr->iopage_buffer->iopage.page[0]),
(void *) (&bufptr->iopage_buffer->iopage.page[DB_PAGESIZE - 1]));
holder = holder->thrd_link;
#endif /* NDEBUG */
}
}
}
/*
* pgbuf_invalidate () - Invalidate page in buffer
* return: NO_ERROR, or ER_code
* pgptr(in): Pointer to page
*
* Note: Invalidate the buffer corresponding to page associated with pgptr when
* the page has been fixed only once, otherwise, the page is only
* unfixed. If the page is invalidated, the page will not be associated
* with the buffer any longer and the buffer can be used for the buffer
* allocation immediately.
*
* The page invalidation task is executed only as a performance
* enhancement and is irrespective of correctness. That is, if
* this task is not performed, the correctness of the system is
* unaffected. When the page invalidation task is used, however, the
* following rules must be observed to prevent incorrectness:
*
* 1. For temporary pages, page invalidation can be performed at any
* time.
* 2. For regular pages (used to store persistent data such as meta data
* and user data), page invalidation must be performed as a postpone
* operation that is executed after the commit decision of the transaction
* has been made. The reason is explained in the
* document [TM-2001-04].
*/
#if !defined(NDEBUG)
int
pgbuf_invalidate_debug (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, const char *caller_file, int caller_line)
#else /* NDEBUG */
int
pgbuf_invalidate (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
#endif /* NDEBUG */
{
PGBUF_BCB *bufptr;
VPID temp_vpid;
int holder_status;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return ER_FAILED;
}
}
/* Get the address of the buffer from the page and invalidate buffer */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (!VPID_ISNULL (&bufptr->vpid));
PGBUF_BCB_LOCK (bufptr);
/*
* This function is called by the caller while it is fixing the page
* with PGBUF_LATCH_WRITE mode in CUBRID environment. Therefore,
* the caller must unfix the page and then invalidate the page.
*/
if (get_fcnt (&bufptr->atomic_latch) > 1)
{
holder_status = pgbuf_unlatch_thrd_holder (thread_p, bufptr, NULL);
#if !defined (NDEBUG)
thread_p->get_pgbuf_tracker ().decrement (pgptr);
#endif // !NDEBUG
/* If the page has been fixed more than one time, just unfix it. */
/* todo: is this really safe? */
if (pgbuf_unlatch_bcb_upon_unfix (thread_p, bufptr, holder_status) != NO_ERROR)
{
return ER_FAILED;
}
/* bufptr->mutex has been released in the above function. */
return NO_ERROR;
}
/* bufptr->fcnt == 1 */
/* Currently, atomic_latch latch_mode is PGBUF_LATCH_WRITE */
if (pgbuf_bcb_safe_flush_force_lock (thread_p, bufptr, true) != NO_ERROR)
{
ASSERT_ERROR ();
return ER_FAILED;
}
/* save the pageid of the page temporarily. */
temp_vpid = bufptr->vpid;
holder_status = pgbuf_unlatch_thrd_holder (thread_p, bufptr, NULL);
#if !defined (NDEBUG)
thread_p->get_pgbuf_tracker ().decrement (pgptr);
#endif // !NDEBUG
if (pgbuf_unlatch_bcb_upon_unfix (thread_p, bufptr, holder_status) != NO_ERROR)
{
return ER_FAILED;
}
/* bufptr->mutex has been released in above function. */
/* hold mutex again to invalidate the BCB */
PGBUF_BCB_LOCK (bufptr);
/* check if the page should be invalidated. */
if (VPID_ISNULL (&bufptr->vpid) || !VPID_EQ (&temp_vpid, &bufptr->vpid) || get_fcnt (&bufptr->atomic_latch) > 0
|| pgbuf_bcb_avoid_victim (bufptr))
{
PGBUF_BCB_UNLOCK (bufptr);
return NO_ERROR;
}
#if defined(CUBRID_DEBUG)
pgbuf_scramble (&bufptr->iopage_buffer->iopage);
#endif /* CUBRID_DEBUG */
/* Now, invalidation task is performed after holding a page latch with PGBUF_LATCH_INVALID mode. */
if (pgbuf_invalidate_bcb (thread_p, bufptr) != NO_ERROR)
{
return ER_FAILED;
}
/* bufptr->mutex has been released in above function. */
return NO_ERROR;
}
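/*
* Editorial usage sketch: the caller holds a write latch on a temporary page it no longer needs (rule 1 in the
* note above); pgbuf_invalidate both unfixes the page and, when possible, invalidates its buffer.
*/
#if 0 /* example only */
static int
example_invalidate_temp_page (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
{
/* pgptr was fixed with PGBUF_LATCH_WRITE; on return it must not be reused */
return pgbuf_invalidate (thread_p, pgptr);
}
#endif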
/*
* pgbuf_invalidate_all () - Invalidate all unfixed buffers corresponding to the given volume
* return: NO_ERROR, or ER_code
* volid(in): Permanent Volume Identifier or NULL_VOLID
*
* Note: The pages in these buffers are disassociated from the buffers.
* If a page was dirty, it is flushed before the buffer is invalidated.
*/
#if !defined(NDEBUG)
int
pgbuf_invalidate_all_debug (THREAD_ENTRY * thread_p, VOLID volid, const char *caller_file, int caller_line)
#else /* NDEBUG */
int
pgbuf_invalidate_all (THREAD_ENTRY * thread_p, VOLID volid)
#endif /* NDEBUG */
{
PGBUF_BCB *bufptr;
VPID temp_vpid;
int bufid;
/*
* While searching all the buffer pages or corresponding buffer pages,
* the caller flushes each buffer page if it is dirty and
* invalidates the buffer page if it is not fixed on the buffer.
*/
for (bufid = 0; bufid < pgbuf_Pool.num_buffers; bufid++)
{
bufptr = PGBUF_FIND_BCB_PTR (bufid);
if (VPID_ISNULL (&bufptr->vpid) || (volid != NULL_VOLID && volid != bufptr->vpid.volid))
{
continue;
}
PGBUF_BCB_LOCK (bufptr);
if (VPID_ISNULL (&bufptr->vpid) || (volid != NULL_VOLID && volid != bufptr->vpid.volid)
|| get_fcnt (&bufptr->atomic_latch) > 0)
{
/* PGBUF_LATCH_READ/PGBUF_LATCH_WRITE */
PGBUF_BCB_UNLOCK (bufptr);
continue;
}
if (pgbuf_bcb_is_dirty (bufptr))
{
temp_vpid = bufptr->vpid;
if (pgbuf_bcb_safe_flush_force_lock (thread_p, bufptr, true) != NO_ERROR)
{
return ER_FAILED;
}
/* check if page invalidation should be performed on the page */
if (VPID_ISNULL (&bufptr->vpid) || !VPID_EQ (&temp_vpid, &bufptr->vpid)
|| (volid != NULL_VOLID && volid != bufptr->vpid.volid) || get_fcnt (&bufptr->atomic_latch) > 0)
{
PGBUF_BCB_UNLOCK (bufptr);
continue;
}
}
if (pgbuf_bcb_avoid_victim (bufptr))
{
PGBUF_BCB_UNLOCK (bufptr);
continue;
}
#if defined(CUBRID_DEBUG)
pgbuf_scramble (&bufptr->iopage_buffer->iopage);
#endif /* CUBRID_DEBUG */
/* Now, page invalidation task is performed while holding a page latch with PGBUF_LATCH_INVALID mode. */
(void) pgbuf_invalidate_bcb (thread_p, bufptr);
/* bufptr->mutex has been released in above function. */
}
return NO_ERROR;
}
/*
* pgbuf_flush () - Flush a page out to disk
* return: pgptr on success, NULL on failure
* pgptr(in): Page pointer
* free_page(in): Free the page too ?
*
* Note: The page associated with pgptr is written out to disk (ONLY when the
* page is dirty) and optionally is freed (See pb_free). The interface
* requires the pgptr instead of vpid to avoid hashing.
*
* Like the page invalidation task, the page flush task is executed
* only for performance enhancement. Unlike the page invalidation
* task, however, it can be performed at any time.
*/
void
pgbuf_flush (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, bool free_page)
{
/* The caller flushes the page but does not really care whether it actually reaches disk, or does not know what to
* do in that case... Using this function is not recommended. */
if (pgbuf_flush_with_wal (thread_p, pgptr) == NULL)
{
ASSERT_ERROR ();
}
if (free_page == FREE)
{
pgbuf_unfix (thread_p, pgptr);
}
}
/*
* pgbuf_flush_with_wal () - Flush a page out to disk after following the wal rule
* return: pgptr on success, NULL on failure
* pgptr(in): Page pointer
*
* Note: The page associated with pgptr is written out to disk (ONLY when the page is dirty)
* Before the page is flushed, the WAL rule of the log manager is called.
*/
PAGE_PTR
pgbuf_flush_with_wal (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return NULL;
}
}
/* NOTE: the page is fixed */
/* Get the address of the buffer from the page. */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (!VPID_ISNULL (&bufptr->vpid));
/* In CUBRID, the caller is holding WRITE page latch */
assert (get_latch (&bufptr->atomic_latch) >= PGBUF_LATCH_READ && pgbuf_find_thrd_holder (thread_p, bufptr) != NULL);
PGBUF_BCB_LOCK (bufptr);
/* Flush the page only when it is dirty */
if (pgbuf_bcb_safe_flush_force_unlock (thread_p, bufptr, true) != NO_ERROR)
{
ASSERT_ERROR ();
return NULL;
}
return pgptr;
}
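/*
* Editorial sketch: the usual sequence for a caller that modified a page under write latch and wants it on
* disk -- mark it dirty, then flush through the WAL-respecting path. pgbuf_set_dirty with DONT_FREE is assumed
* (from the public page buffer interface) to keep the page fixed.
*/
#if 0 /* example only */
static void
example_flush_dirty_page (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
{
pgbuf_set_dirty (thread_p, pgptr, DONT_FREE); /* page stays fixed */
if (pgbuf_flush_with_wal (thread_p, pgptr) == NULL)
{
ASSERT_ERROR (); /* flush failed; error already set */
}
pgbuf_unfix_and_init (thread_p, pgptr);
}
#endif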
/*
* pgbuf_flush_if_requested () - flush page if needed. this function is used for permanently latched pages; the thread
* holding the latch should periodically check whether a flush has been requested (usually by the checkpoint thread).
*
* return : void
* thread_p (in) : thread entry
* page (in) : page
*/
void
pgbuf_flush_if_requested (THREAD_ENTRY * thread_p, PAGE_PTR page)
{
PGBUF_BCB *bcb;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (page) == false)
{
assert (false);
return;
}
}
/* NOTE: the page is fixed */
/* Get the address of the buffer from the page. */
CAST_PGPTR_TO_BFPTR (bcb, page);
assert (!VPID_ISNULL (&bcb->vpid));
/* caller should have write latch, otherwise there is no point in calling this function */
assert (get_latch (&bcb->atomic_latch) == PGBUF_LATCH_WRITE && pgbuf_find_thrd_holder (thread_p, bcb) != NULL);
if (pgbuf_bcb_is_async_flush_request (bcb))
{
PGBUF_BCB_LOCK (bcb);
if (pgbuf_bcb_safe_flush_force_unlock (thread_p, bcb, false) != NO_ERROR)
{
assert (false);
}
}
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
}
static int
pgbuf_flush_all_helper (THREAD_ENTRY * thread_p, VOLID volid, bool is_unfixed_only, bool is_set_lsa_as_null)
{
PGBUF_BCB *bufptr;
int i, ret = NO_ERROR;
/* Flush all unfixed dirty buffers */
for (i = 0; i < pgbuf_Pool.num_buffers; i++)
{
bufptr = PGBUF_FIND_BCB_PTR (i);
if (!pgbuf_bcb_is_dirty (bufptr) || (volid != NULL_VOLID && volid != bufptr->vpid.volid))
{
continue;
}
PGBUF_BCB_LOCK (bufptr);
/* flush condition check */
if (!pgbuf_bcb_is_dirty (bufptr) || (is_unfixed_only && get_fcnt (&bufptr->atomic_latch) > 0)
|| (volid != NULL_VOLID && volid != bufptr->vpid.volid))
{
PGBUF_BCB_UNLOCK (bufptr);
continue;
}
if (is_set_lsa_as_null)
{
/* set PageLSA as NULL value */
fileio_init_lsa_of_page (&bufptr->iopage_buffer->iopage, IO_PAGESIZE);
}
/* flush */
if (pgbuf_bcb_safe_flush_force_unlock (thread_p, bufptr, true) != NO_ERROR)
{
/* best efforts */
assert (false);
ret = ER_FAILED;
}
/* Above function released mutex regardless of its return value. */
}
return ret;
}
/*
* pgbuf_flush_all () - Flush all dirty pages out to disk
* return: NO_ERROR, or ER_code
* volid(in): Permanent Volume Identifier or NULL_VOLID
*
* Note: Every dirty page of the specified volume is written out to disk.
* If volid is equal to NULL_VOLID, all dirty pages of all volumes are
* written out to disk. Its use is recommended only for the log and
* recovery manager.
*/
int
pgbuf_flush_all (THREAD_ENTRY * thread_p, VOLID volid)
{
return pgbuf_flush_all_helper (thread_p, volid, false, false);
}
/*
* pgbuf_flush_all_unfixed () - Flush all unfixed dirty pages out to disk
* return: NO_ERROR, or ER_code
* volid(in): Permanent Volume Identifier or NULL_VOLID
*
* Note: Every dirty page of the specified volume which is unfixed is written
* out to disk. If volid is equal to NULL_VOLID, all dirty pages of all
* volumes that are unfixed are written out to disk.
* Its use is recommended only for the log and recovery manager.
*/
int
pgbuf_flush_all_unfixed (THREAD_ENTRY * thread_p, VOLID volid)
{
return pgbuf_flush_all_helper (thread_p, volid, true, false);
}
/*
* pgbuf_flush_all_unfixed_and_set_lsa_as_null () - Set lsa to null and flush all unfixed dirty pages out to disk
* return: NO_ERROR, or ER_code
* volid(in): Permanent Volume Identifier or NULL_VOLID
*
* Note: Every unfixed dirty page of the specified volume is written
* out after its lsa is initialized to a null lsa. If volid is equal to
* NULL_VOLID, all unfixed dirty pages of all volumes are
* flushed to disk after their lsas are initialized to null.
* Its use is recommended only for the log and recovery manager.
*/
int
pgbuf_flush_all_unfixed_and_set_lsa_as_null (THREAD_ENTRY * thread_p, VOLID volid)
{
return pgbuf_flush_all_helper (thread_p, volid, true, true);
}
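/* Summary of the three wrappers above in terms of the helper's (is_unfixed_only, is_set_lsa_as_null) flags:
*
*   pgbuf_flush_all (thread_p, volid)                              -> (false, false)
*   pgbuf_flush_all_unfixed (thread_p, volid)                      -> (true, false)
*   pgbuf_flush_all_unfixed_and_set_lsa_as_null (thread_p, volid)  -> (true, true)
*/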
/*
* pgbuf_compare_victim_list () - Compare the vpids of two victim candidate entries
* return: p1 - p2
* p1(in): victim candidate list 1
* p2(in): victim candidate list 2
*/
static int
pgbuf_compare_victim_list (const void *p1, const void *p2)
{
PGBUF_VICTIM_CANDIDATE_LIST *node1, *node2;
int diff;
node1 = (PGBUF_VICTIM_CANDIDATE_LIST *) p1;
node2 = (PGBUF_VICTIM_CANDIDATE_LIST *) p2;
diff = node1->vpid.volid - node2->vpid.volid;
if (diff != 0)
{
return diff;
}
else
{
return (node1->vpid.pageid - node2->vpid.pageid);
}
}
/*
* pgbuf_get_victim_candidates_from_lru () - get victim candidates from LRU lists
* return : number of victim candidates found
* thread_p (in) : thread entry
* check_count (in) : number of items to verify before abandoning search
* lru_sum_flush_priority (in) : sum of victim flush priorities over all LRU lists, used to apportion check_count
* assigned_directly (out) : output true if a bcb was assigned directly.
*/
static int
pgbuf_get_victim_candidates_from_lru (THREAD_ENTRY * thread_p, int check_count, float lru_sum_flush_priority,
bool * assigned_directly)
{
int lru_idx, victim_cand_count, i;
PGBUF_BCB *bufptr;
int check_count_this_lru;
float victim_flush_priority_this_lru;
int count_checked_lists = 0;
#if defined (SERVER_MODE)
/* to handle a rare case in which there are direct victim waiters although there are plenty of victims, the flush
* thread directly assigns one bcb per iteration. this adds only a little overhead in the general case. */
bool try_direct_assign = true;
#endif /* SERVER_MODE */
/* init */
victim_cand_count = 0;
for (lru_idx = 0; lru_idx < PGBUF_TOTAL_LRU_COUNT; lru_idx++)
{
victim_flush_priority_this_lru = pgbuf_Pool.quota.lru_victim_flush_priority_per_lru[lru_idx];
if (victim_flush_priority_this_lru <= 0)
{
/* no target for this list. */
continue;
}
++count_checked_lists;
check_count_this_lru = (int) (victim_flush_priority_this_lru * (float) check_count / lru_sum_flush_priority);
check_count_this_lru = MAX (check_count_this_lru, 1);
i = check_count_this_lru;
(void) pthread_mutex_lock (&pgbuf_Pool.buf_LRU_list[lru_idx].mutex);
for (bufptr = pgbuf_Pool.buf_LRU_list[lru_idx].bottom;
bufptr != NULL && PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (bufptr) && i > 0; bufptr = bufptr->prev_BCB, i--)
{
if (pgbuf_bcb_is_dirty (bufptr))
{
/* save victim candidate information temporarily. */
pgbuf_Pool.victim_cand_list[victim_cand_count].bufptr = bufptr;
pgbuf_Pool.victim_cand_list[victim_cand_count].vpid = bufptr->vpid;
victim_cand_count++;
}
#if defined (SERVER_MODE)
else if (try_direct_assign && pgbuf_is_any_thread_waiting_for_direct_victim ()
&& pgbuf_is_bcb_victimizable (bufptr, false) && PGBUF_BCB_TRYLOCK (bufptr) == 0)
{
if (pgbuf_is_bcb_victimizable (bufptr, true) && pgbuf_assign_direct_victim (thread_p, bufptr))
{
/* assigned directly. don't try any other. */
try_direct_assign = false;
*assigned_directly = true;
perfmon_inc_stat (thread_p, PSTAT_PB_VICTIM_ASSIGN_DIRECT_SEARCH_FOR_FLUSH);
}
PGBUF_BCB_UNLOCK (bufptr);
}
#endif /* SERVER_MODE */
}
pthread_mutex_unlock (&pgbuf_Pool.buf_LRU_list[lru_idx].mutex);
}
if (prm_get_bool_value (PRM_ID_LOG_PGBUF_VICTIM_FLUSH))
{
_er_log_debug (ARG_FILE_LINE,
"pgbuf_flush_victim_candidates: pgbuf_get_victim_candidates_from_lru %d candidates in %d lists \n",
victim_cand_count, count_checked_lists);
}
return victim_cand_count;
}
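/* Worked example of the check_count distribution above (numbers are hypothetical): with check_count = 100 and three
* active lists whose flush priorities are 0.5, 0.3 and 0.2 (lru_sum_flush_priority = 1.0), the per-list budgets are
* 50, 30 and 20 bottom entries respectively; any list with a non-zero priority is always given at least 1. */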
/*
* pgbuf_flush_victim_candidates () - collect & flush victim candidates
*
* return : error code
* thread_p (in) : thread entry
* flush_ratio (in) : desired flush ratio
* perf_tracker (in/out) : time tracker for performance statistics
* stop (out) : output to stop looping
*/
int
pgbuf_flush_victim_candidates (THREAD_ENTRY * thread_p, float flush_ratio, PERF_UTIME_TRACKER * perf_tracker,
bool * stop)
{
PGBUF_BCB *bufptr;
PGBUF_VICTIM_CANDIDATE_LIST *victim_cand_list;
int i, victim_count = 0;
int check_count_lru;
int cfg_check_cnt;
int total_flushed_count;
int error = NO_ERROR;
float lru_miss_rate;
float lru_dynamic_flush_adj = 1.0f;
int lru_victim_req_cnt, fix_req_cnt;
float lru_sum_flush_priority;
int count_need_wal = 0;
LOG_LSA lsa_need_wal = LSA_INITIALIZER;
#if defined(SERVER_MODE)
LOG_LSA save_lsa_need_wal = LSA_INITIALIZER;
static THREAD_ENTRY *page_flush_thread = NULL;
bool repeated = false;
#endif /* SERVER_MODE */
bool is_bcb_locked = false;
bool detailed_perf = perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION);
bool assigned_directly = false;
#if !defined (NDEBUG) && defined (SERVER_MODE)
bool empty_flushed_bcb_queue = false;
bool direct_victim_waiters = false;
#endif /* !NDEBUG && SERVER_MODE */
// stats
UINT64 num_skipped_already_flushed = 0;
UINT64 num_skipped_fixed_or_hot = 0;
UINT64 num_skipped_need_wal = 0;
UINT64 num_skipped_flush = 0;
bool logging = prm_get_bool_value (PRM_ID_LOG_PGBUF_VICTIM_FLUSH);
er_set (ER_NOTIFICATION_SEVERITY, ARG_FILE_LINE, ER_LOG_FLUSH_VICTIM_STARTED, 0);
if (logging)
{
_er_log_debug (ARG_FILE_LINE, "pgbuf_flush_victim_candidates: start flush victim candidates\n");
}
#if !defined(NDEBUG) && defined(SERVER_MODE)
if (pgbuf_is_page_flush_daemon_available ())
{
if (page_flush_thread == NULL)
{
page_flush_thread = thread_p;
}
/* This should be fixed */
assert (page_flush_thread == thread_p);
}
#endif
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
*stop = false;
pgbuf_compute_lru_vict_target (&lru_sum_flush_priority);
victim_cand_list = pgbuf_Pool.victim_cand_list;
victim_count = 0;
total_flushed_count = 0;
check_count_lru = 0;
lru_victim_req_cnt = ATOMIC_TAS_32 (&pgbuf_Pool.monitor.lru_victim_req_cnt, 0);
fix_req_cnt = pgbuf_Pool.monitor.fix_req_cnt.exchange (0, std::memory_order_seq_cst);
if (fix_req_cnt > lru_victim_req_cnt)
{
lru_miss_rate = (float) lru_victim_req_cnt / (float) fix_req_cnt;
}
else
{
/* overflow of fix counter, we ignore miss rate */
lru_miss_rate = 0;
}
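/* Worked example (hypothetical counters): with fix_req_cnt = 10000 fixes and lru_victim_req_cnt = 200 victim
* requests since the previous run, lru_miss_rate = 200 / 10000 = 0.02. */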
cfg_check_cnt = (int) (pgbuf_Pool.num_buffers * flush_ratio);
/* Victims will only be flushed, not decached. */
#if defined (SERVER_MODE)
/* do not apply flush boost during checkpoint; since checkpoint is already flushing pages we expect some of the victim
* candidates are already flushed by checkpoint */
if (pgbuf_Pool.is_checkpoint == false)
{
lru_dynamic_flush_adj = MAX (1.0f, 1 + (PGBUF_FLUSH_VICTIM_BOOST_MULT - 1) * lru_miss_rate);
lru_dynamic_flush_adj = MIN (PGBUF_FLUSH_VICTIM_BOOST_MULT, lru_dynamic_flush_adj);
}
else
#endif
{
lru_dynamic_flush_adj = 1.0f;
}
check_count_lru = (int) (cfg_check_cnt * lru_dynamic_flush_adj);
/* limit the checked BCBs to the equivalent of 200 MB */
check_count_lru = MIN (check_count_lru, (200 * 1024 * 1024) / DB_PAGESIZE);
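/* e.g., assuming 16 KB data pages, this cap is (200 * 1024 * 1024) / 16384 = 12800 checked BCBs per run; the exact
* value depends on the configured DB_PAGESIZE. */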
#if !defined (NDEBUG) && defined (SERVER_MODE)
empty_flushed_bcb_queue = pgbuf_Pool.flushed_bcbs->is_empty ();
direct_victim_waiters = pgbuf_is_any_thread_waiting_for_direct_victim ();
#endif /* !NDEBUG && SERVER_MODE */
if (check_count_lru > 0 && lru_sum_flush_priority > 0)
{
victim_count =
pgbuf_get_victim_candidates_from_lru (thread_p, check_count_lru, lru_sum_flush_priority, &assigned_directly);
}
if (victim_count == 0)
{
/* We didn't find any victims */
PERF_UTIME_TRACKER_TIME_AND_RESTART (thread_p, perf_tracker, PSTAT_PB_FLUSH_COLLECT);
/* if pgbuf_get_victim_candidates_from_lru failed to provide candidates, it means we already flushed enough.
* give threads looking for victims a chance to find them before looping again. output hint to stop looping. */
*stop = check_count_lru > 0 && lru_sum_flush_priority > 0;
goto end;
}
#if defined (SERVER_MODE)
/* wake up log flush thread. we need log up to date to be able to flush pages */
if (log_is_log_flush_daemon_available ())
{
log_wakeup_log_flush_daemon ();
}
else
#endif /* SERVER_MODE */
{
logpb_force_flush_pages (thread_p);
}
if (prm_get_bool_value (PRM_ID_PB_SEQUENTIAL_VICTIM_FLUSH) == true)
{
qsort ((void *) victim_cand_list, victim_count, sizeof (PGBUF_VICTIM_CANDIDATE_LIST), pgbuf_compare_victim_list);
}
#if defined (SERVER_MODE)
pgbuf_Pool.is_flushing_victims = true;
#endif
if (logging)
{
_er_log_debug (ARG_FILE_LINE, "pgbuf_flush_victim_candidates: start flushing collected victim candidates\n");
}
if (perf_tracker->is_perf_tracking)
{
UINT64 utime;
tsc_getticks (&perf_tracker->end_tick);
utime = tsc_elapsed_utime (perf_tracker->end_tick, perf_tracker->start_tick);
perfmon_time_stat (thread_p, PSTAT_PB_FLUSH_COLLECT, utime);
if (detailed_perf)
{
perfmon_time_bulk_stat (thread_p, PSTAT_PB_FLUSH_COLLECT_PER_PAGE, utime, victim_count);
}
perf_tracker->start_tick = perf_tracker->end_tick;
}
#if defined (SERVER_MODE)
repeat:
#endif
count_need_wal = 0;
/* temporarily disable second iteration */
/* for each victim candidate, do flush task */
for (i = 0; i < victim_count; i++)
{
int flushed_pages = 0;
bufptr = victim_cand_list[i].bufptr;
PGBUF_BCB_LOCK (bufptr);
/* check flush conditions */
if (!VPID_EQ (&bufptr->vpid, &victim_cand_list[i].vpid) || !pgbuf_bcb_is_dirty (bufptr)
|| pgbuf_bcb_is_flushing (bufptr))
{
/* must be already flushed or currently flushing */
PGBUF_BCB_UNLOCK (bufptr);
++num_skipped_already_flushed;
continue;
}
if (!PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (bufptr) || pgbuf_is_bcb_fixed_by_any (bufptr, false))
{
/* page was fixed or became hot after selected as victim. do not flush it. */
PGBUF_BCB_UNLOCK (bufptr);
++num_skipped_fixed_or_hot;
continue;
}
if (logpb_need_wal (&bufptr->iopage_buffer->iopage.prv.lsa))
{
/* we cannot flush a page unless log has been flushed up until page LSA. otherwise we might have recovery
* issues. */
count_need_wal++;
if (LSA_ISNULL (&lsa_need_wal) || LSA_LE (&lsa_need_wal, &(bufptr->iopage_buffer->iopage.prv.lsa)))
{
LSA_COPY (&lsa_need_wal, &(bufptr->iopage_buffer->iopage.prv.lsa));
}
PGBUF_BCB_UNLOCK (bufptr);
++num_skipped_need_wal;
#if defined (SERVER_MODE)
log_wakeup_log_flush_daemon ();
#endif /* SERVER_MODE */
continue;
}
if (PGBUF_NEIGHBOR_PAGES > 1)
{
error = pgbuf_flush_page_and_neighbors_fb (thread_p, bufptr, &flushed_pages);
/* BCB mutex already unlocked by neighbor flush function */
}
else
{
error = pgbuf_bcb_flush_with_wal (thread_p, bufptr, true, &is_bcb_locked);
if (is_bcb_locked)
{
PGBUF_BCB_UNLOCK (bufptr);
}
flushed_pages = 1;
}
if (error != NO_ERROR)
{
/* if this shows up in statistics or log, consider it a red flag */
if (logging)
{
_er_log_debug (ARG_FILE_LINE, "pgbuf_flush_victim_candidates: error during flush");
}
goto end;
}
total_flushed_count += flushed_pages;
}
num_skipped_flush = num_skipped_need_wal + num_skipped_fixed_or_hot + num_skipped_already_flushed;
if (perf_tracker->is_perf_tracking)
{
perfmon_add_stat (thread_p, PSTAT_PB_NUM_SKIPPED_FLUSH, num_skipped_flush);
if (detailed_perf)
{
perfmon_add_stat (thread_p, PSTAT_PB_NUM_SKIPPED_NEED_WAL, num_skipped_need_wal);
perfmon_add_stat (thread_p, PSTAT_PB_NUM_SKIPPED_FIXED_OR_HOT, num_skipped_fixed_or_hot);
perfmon_add_stat (thread_p, PSTAT_PB_NUM_SKIPPED_ALREADY_FLUSHED, num_skipped_already_flushed);
}
UINT64 utime;
tsc_getticks (&perf_tracker->end_tick);
utime = tsc_elapsed_utime (perf_tracker->end_tick, perf_tracker->start_tick);
perfmon_time_stat (thread_p, PSTAT_PB_FLUSH_FLUSH, utime);
if (detailed_perf)
{
perfmon_time_bulk_stat (thread_p, PSTAT_PB_FLUSH_FLUSH_PER_PAGE, utime, total_flushed_count);
}
perf_tracker->start_tick = perf_tracker->end_tick;
}
end:
#if defined (SERVER_MODE)
if (pgbuf_is_any_thread_waiting_for_direct_victim () && victim_count != 0 && count_need_wal == victim_count)
{
/* log flush thread did not wake up in time. we must make sure log is flushed and retry. */
if (repeated)
{
/* already waited and failed again? all bcb's must have changed again (confirm by comparing save_lsa_need_wal
* and lsa_need_wal). */
assert (LSA_LT (&save_lsa_need_wal, &lsa_need_wal));
}
else
{
repeated = true;
save_lsa_need_wal = lsa_need_wal;
logpb_flush_log_for_wal (thread_p, &lsa_need_wal);
goto repeat;
}
}
pgbuf_Pool.is_flushing_victims = false;
#endif /* SERVER_MODE */
if (logging)
{
_er_log_debug (ARG_FILE_LINE,
"pgbuf_flush_victim_candidates: flush %d pages from lru lists.\n"
"\tvictim_count = %d\n"
"\tcheck_count_lru = %d\n"
"\tnum_skipped_need_wal = %d\n"
"\tnum_skipped_fixed_or_hot = %d\n"
"\tnum_skipped_already_flushed = %d\n",
total_flushed_count, victim_count, check_count_lru, num_skipped_need_wal, num_skipped_fixed_or_hot,
num_skipped_already_flushed);
}
er_set (ER_NOTIFICATION_SEVERITY, ARG_FILE_LINE, ER_LOG_FLUSH_VICTIM_FINISHED, 1, total_flushed_count);
perfmon_add_stat (thread_p, PSTAT_PB_NUM_FLUSHED, total_flushed_count);
return error;
}
/*
* pgbuf_flush_checkpoint () - Flush any unfixed dirty page whose lsa is smaller than the last checkpoint lsa
* return:error code or NO_ERROR
* flush_upto_lsa(in):
* prev_chkpt_redo_lsa(in): Redo_LSA of previous checkpoint
* smallest_lsa(out): Smallest LSA of a dirty buffer in buffer pool
* flushed_page_cnt(out): The number of flushed pages
*
* Note: The function flushes any dirty unfixed page whose LSA is smaller than the last_chkpt_lsa,
* and returns the smallest_lsa among the remaining dirty buffers which were not flushed.
* This function is used by the log and recovery manager when a checkpoint is issued.
*/
int
pgbuf_flush_checkpoint (THREAD_ENTRY * thread_p, const LOG_LSA * flush_upto_lsa, const LOG_LSA * prev_chkpt_redo_lsa,
LOG_LSA * smallest_lsa, int *flushed_page_cnt)
{
#define detailed_er_log(...) if (detailed_logging) _er_log_debug (ARG_FILE_LINE, __VA_ARGS__)
PGBUF_BCB *bufptr;
int bufid;
int flushed_page_cnt_local = 0;
PGBUF_SEQ_FLUSHER *seq_flusher;
PGBUF_VICTIM_CANDIDATE_LIST *f_list;
int collected_bcbs;
int error = NO_ERROR;
bool detailed_logging = prm_get_bool_value (PRM_ID_LOG_CHKPT_DETAILED);
detailed_er_log ("pgbuf_flush_checkpoint start : flush_upto_LSA:%d, prev_chkpt_redo_LSA:%d\n",
flush_upto_lsa->pageid, (prev_chkpt_redo_lsa ? prev_chkpt_redo_lsa->pageid : -1));
if (flushed_page_cnt != NULL)
{
*flushed_page_cnt = -1;
}
/* Things must be truly flushed up to this lsa */
logpb_flush_log_for_wal (thread_p, flush_upto_lsa);
LSA_SET_NULL (smallest_lsa);
seq_flusher = &(pgbuf_Pool.seq_chkpt_flusher);
f_list = seq_flusher->flush_list;
LSA_COPY (&seq_flusher->flush_upto_lsa, flush_upto_lsa);
detailed_er_log ("pgbuf_flush_checkpoint start : start\n");
collected_bcbs = 0;
#if defined (SERVER_MODE)
pgbuf_Pool.is_checkpoint = true;
#endif
for (bufid = 0; bufid < pgbuf_Pool.num_buffers; bufid++)
{
if (collected_bcbs >= seq_flusher->flush_max_size)
{
/* flush existing list */
seq_flusher->flush_cnt = collected_bcbs;
seq_flusher->flush_idx = 0;
qsort (f_list, seq_flusher->flush_cnt, sizeof (f_list[0]), pgbuf_compare_victim_list);
error = pgbuf_flush_chkpt_seq_list (thread_p, seq_flusher, prev_chkpt_redo_lsa, smallest_lsa);
if (error != NO_ERROR)
{
#if defined (SERVER_MODE)
pgbuf_Pool.is_checkpoint = false;
#endif
return error;
}
flushed_page_cnt_local += seq_flusher->flushed_pages;
collected_bcbs = 0;
}
bufptr = PGBUF_FIND_BCB_PTR (bufid);
PGBUF_BCB_LOCK (bufptr);
/* flush condition check */
if (!pgbuf_bcb_is_dirty (bufptr)
|| (!LSA_ISNULL (&bufptr->oldest_unflush_lsa) && LSA_GT (&bufptr->oldest_unflush_lsa, flush_upto_lsa))
|| pgbuf_is_temporary_volume (bufptr->vpid.volid))
{
PGBUF_BCB_UNLOCK (bufptr);
continue;
}
if (!LSA_ISNULL (&bufptr->oldest_unflush_lsa) && prev_chkpt_redo_lsa != NULL && !LSA_ISNULL (prev_chkpt_redo_lsa))
{
if (LSA_LT (&bufptr->oldest_unflush_lsa, prev_chkpt_redo_lsa))
{
er_stack_push ();
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_LOG_CHECKPOINT_SKIP_INVALID_PAGE, 6, bufptr->vpid.pageid,
fileio_get_volume_label (bufptr->vpid.volid, PEEK), bufptr->oldest_unflush_lsa.pageid,
bufptr->oldest_unflush_lsa.offset, prev_chkpt_redo_lsa->pageid, prev_chkpt_redo_lsa->offset);
er_stack_pop ();
assert (false);
}
}
/* add to flush list */
f_list[collected_bcbs].bufptr = bufptr;
VPID_COPY (&f_list[collected_bcbs].vpid, &bufptr->vpid);
PGBUF_BCB_UNLOCK (bufptr);
collected_bcbs++;
#if defined(SERVER_MODE)
if (thread_p != NULL && thread_p->shutdown == true)
{
pgbuf_Pool.is_checkpoint = false;
return ER_FAILED;
}
#endif
}
if (collected_bcbs > 0)
{
/* flush existing list */
seq_flusher->flush_cnt = collected_bcbs;
seq_flusher->flush_idx = 0;
qsort (f_list, seq_flusher->flush_cnt, sizeof (f_list[0]), pgbuf_compare_victim_list);
error = pgbuf_flush_chkpt_seq_list (thread_p, seq_flusher, prev_chkpt_redo_lsa, smallest_lsa);
flushed_page_cnt_local += seq_flusher->flushed_pages;
}
#if defined (SERVER_MODE)
pgbuf_Pool.is_checkpoint = false;
#endif
detailed_er_log ("pgbuf_flush_checkpoint END flushed:%d\n", flushed_page_cnt_local);
if (flushed_page_cnt != NULL)
{
*flushed_page_cnt = flushed_page_cnt_local;
}
return error;
#undef detailed_er_log
}
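/* Illustrative call from a checkpoint-like context (a sketch; the chkpt_lsa and prev_chkpt_redo_lsa variables are
* assumptions, not taken from this file):
*
*   LOG_LSA smallest_lsa;
*   int flushed_cnt;
*   if (pgbuf_flush_checkpoint (thread_p, &chkpt_lsa, &prev_chkpt_redo_lsa, &smallest_lsa, &flushed_cnt) == NO_ERROR)
*     {
*       // smallest_lsa is a null LSA if every dirty page up to chkpt_lsa was flushed;
*       // otherwise it bounds where the new checkpoint's redo must start.
*     }
*/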
/*
* pgbuf_flush_chkpt_seq_list () - flush a sequence of pages during checkpoint
* return:error code or NO_ERROR
* thread_p(in):
* seq_flusher(in): container for list of pages
* prev_chkpt_redo_lsa(in): LSA of previous checkpoint
* chkpt_smallest_lsa(out): smallest LSA found in a page
*
*/
static int
pgbuf_flush_chkpt_seq_list (THREAD_ENTRY * thread_p, PGBUF_SEQ_FLUSHER * seq_flusher,
const LOG_LSA * prev_chkpt_redo_lsa, LOG_LSA * chkpt_smallest_lsa)
{
#define WAIT_FLUSH_VICTIMS_MAX_MSEC 1500.0f
int error = NO_ERROR;
struct timeval *p_limit_time;
int total_flushed;
int time_rem;
#if defined (SERVER_MODE)
int flush_interval, sleep_msecs;
float wait_victims;
float chkpt_flush_rate;
struct timeval limit_time = { 0, 0 };
struct timeval cur_time = { 0, 0 };
#endif
#if defined (SERVER_MODE)
sleep_msecs = prm_get_integer_value (PRM_ID_LOG_CHECKPOINT_SLEEP_MSECS);
if (sleep_msecs > 0)
{
chkpt_flush_rate = 1000.0f / (float) sleep_msecs;
}
else
{
chkpt_flush_rate = 1000.0f;
}
flush_interval = (int) (1000.0f * PGBUF_CHKPT_BURST_PAGES / chkpt_flush_rate);
seq_flusher->interval_msec = flush_interval;
#endif
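/* Worked example (hypothetical parameter value): with PRM_ID_LOG_CHECKPOINT_SLEEP_MSECS = 10, chkpt_flush_rate =
* 1000 / 10 = 100 pages/sec, so a burst of PGBUF_CHKPT_BURST_PAGES pages is allotted an interval of
* 1000 * PGBUF_CHKPT_BURST_PAGES / 100 milliseconds. */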
total_flushed = 0;
seq_flusher->control_flushed = 0;
seq_flusher->control_intervals_cnt = 0;
while (seq_flusher->flush_idx < seq_flusher->flush_cnt)
{
#if defined (SERVER_MODE)
if (thread_p != NULL && thread_p->shutdown)
{
// stop
return ER_FAILED;
}
gettimeofday (&cur_time, NULL);
/* compute time limit for allowed flush interval */
timeval_add_msec (&limit_time, &cur_time, flush_interval);
seq_flusher->flush_rate = chkpt_flush_rate;
p_limit_time = &limit_time;
#else
p_limit_time = NULL;
#endif
#if defined (SERVER_MODE)
wait_victims = 0;
while (pgbuf_Pool.is_flushing_victims == true && wait_victims < WAIT_FLUSH_VICTIMS_MAX_MSEC)
{
/* wait 100 micro-seconds */
thread_sleep (0.1f);
wait_victims += 0.1f;
}
#endif
error = pgbuf_flush_seq_list (thread_p, seq_flusher, p_limit_time, prev_chkpt_redo_lsa, chkpt_smallest_lsa,
&time_rem);
total_flushed += seq_flusher->flushed_pages;
if (error != NO_ERROR)
{
seq_flusher->flushed_pages = total_flushed;
return error;
}
#if defined (SERVER_MODE)
if (time_rem > 0)
{
thread_sleep (time_rem);
}
#endif
}
seq_flusher->flushed_pages = total_flushed;
return error;
#undef WAIT_FLUSH_VICTIMS_MAX_MSEC
}
/*
* pgbuf_flush_seq_list () - flushes a sequence of pages
* return:error code or NO_ERROR
* thread_p(in):
* seq_flusher(in): container for list of pages
* limit_time(in): absolute time limit allowed for this call
* prev_chkpt_redo_lsa(in): LSA of previous checkpoint
* chkpt_smallest_lsa(out): smallest LSA found in a page
* time_rem(in): time remaining until limit time expires
*
* Note : burst_mode from seq_flusher container controls how the flush is performed:
* - if enabled, an amount of pages is flushed as soon as possible,
* according to desired flush rate and time limit
* - if disabled, the same amount of pages is flushed, but with a
* pause between each flushed page.
* Since data flush is concurrent with other IO, burst mode increases
* the chance that data and other IO sequences do not mix at IO
* scheduler level and break each-other's sequentiality.
*/
static int
pgbuf_flush_seq_list (THREAD_ENTRY * thread_p, PGBUF_SEQ_FLUSHER * seq_flusher, struct timeval *limit_time,
const LOG_LSA * prev_chkpt_redo_lsa, LOG_LSA * chkpt_smallest_lsa, int *time_rem)
{
#define detailed_er_log(...) if (detailed_logging) _er_log_debug (ARG_FILE_LINE, __VA_ARGS__)
PGBUF_BCB *bufptr;
PGBUF_VICTIM_CANDIDATE_LIST *f_list;
int error = NO_ERROR;
int avail_time_msec = 0, time_rem_msec = 0;
#if defined (SERVER_MODE)
double sleep_msecs = 0;
struct timeval cur_time = { 0, 0 };
#endif /* SERVER_MODE */
int flush_per_interval;
int cnt_writes;
int dropped_pages;
bool done_flush;
float control_est_flush_total = 0;
int control_total_cnt_intervals = 0;
bool ignore_time_limit = false;
bool flush_if_already_flushed;
bool locked_bcb = false;
bool detailed_logging = prm_get_bool_value (PRM_ID_LOG_CHKPT_DETAILED);
assert (seq_flusher != NULL);
f_list = seq_flusher->flush_list;
#if defined (SERVER_MODE)
gettimeofday (&cur_time, NULL);
if (seq_flusher->burst_mode == true)
{
assert_release (limit_time != NULL);
}
*time_rem = 0;
if (limit_time != NULL)
{
/* limited time job: amount to flush in this interval */
avail_time_msec = (int) timeval_diff_in_msec (limit_time, &cur_time);
control_total_cnt_intervals = (int) (1000.f / (float) seq_flusher->interval_msec + 0.5f);
if (seq_flusher->control_intervals_cnt > 0)
{
control_est_flush_total =
(seq_flusher->flush_rate * (float) (seq_flusher->control_intervals_cnt + 1) /
(float) control_total_cnt_intervals);
flush_per_interval = (int) (control_est_flush_total - seq_flusher->control_flushed);
}
else
{
flush_per_interval = (int) (seq_flusher->flush_rate / control_total_cnt_intervals);
if (seq_flusher->control_intervals_cnt < 0)
{
flush_per_interval -= seq_flusher->control_flushed;
}
}
}
else
{
/* flush all */
avail_time_msec = -1;
flush_per_interval = seq_flusher->flush_cnt;
}
flush_per_interval =
(int) MAX (flush_per_interval, (PGBUF_CHKPT_MIN_FLUSH_RATE * seq_flusher->interval_msec) / 1000.0f);
#else
flush_per_interval = seq_flusher->flush_cnt;
#endif /* SERVER_MODE */
detailed_er_log ("pgbuf_flush_seq_list (%s): start_idx:%d, flush_cnt:%d, LSA_flush:%d, "
"flush_rate:%.2f, control_flushed:%d, this_interval:%d, "
"Est_tot_flush:%.2f, control_intervals:%d, %d Avail_time:%d\n", "chkpt",
seq_flusher->flush_idx, seq_flusher->flush_cnt, seq_flusher->flush_upto_lsa.pageid,
seq_flusher->flush_rate, seq_flusher->control_flushed, flush_per_interval, control_est_flush_total,
seq_flusher->control_intervals_cnt, control_total_cnt_intervals, avail_time_msec);
/* Start to flush */
cnt_writes = 0;
dropped_pages = 0;
seq_flusher->flushed_pages = 0;
for (; seq_flusher->flush_idx < seq_flusher->flush_cnt && seq_flusher->flushed_pages < flush_per_interval;
seq_flusher->flush_idx++)
{
bufptr = f_list[seq_flusher->flush_idx].bufptr;
/* prefer sequentiality to an unnecessary flush; skip an already flushed page if it is the last in the list or if
* there is already a gap due to a missing next page */
flush_if_already_flushed = true;
if (seq_flusher->flush_idx + 1 >= seq_flusher->flush_cnt
|| f_list[seq_flusher->flush_idx].vpid.pageid + 1 != f_list[seq_flusher->flush_idx + 1].vpid.pageid)
{
flush_if_already_flushed = false;
}
PGBUF_BCB_LOCK (bufptr);
locked_bcb = true;
if (!VPID_EQ (&bufptr->vpid, &f_list[seq_flusher->flush_idx].vpid) || !pgbuf_bcb_is_dirty (bufptr)
|| (flush_if_already_flushed == false && !LSA_ISNULL (&bufptr->oldest_unflush_lsa)
&& LSA_GT (&bufptr->oldest_unflush_lsa, &seq_flusher->flush_upto_lsa)))
{
PGBUF_BCB_UNLOCK (bufptr);
dropped_pages++;
continue;
}
done_flush = false;
if (pgbuf_bcb_safe_flush_force_lock (thread_p, bufptr, true) == NO_ERROR)
{
if (!LSA_ISNULL (&bufptr->oldest_unflush_lsa)
&& LSA_LE (&bufptr->oldest_unflush_lsa, &seq_flusher->flush_upto_lsa))
{
/* I am not sure if this is really possible. But let's assume that bcb was already flushing before
* checkpoint reached it. And that it was modified again. And that the new oldest_unflush_lsa is less than
* flush_upto_lsa. It may seem that many planets should align, but let's be conservative and flush again.
*/
detailed_er_log ("pgbuf_flush_seq_list: flush again %d|%d; oldest_unflush_lsa=%lld|%d, "
"flush_upto_lsa=%lld|%d \n", VPID_AS_ARGS (&bufptr->vpid),
LSA_AS_ARGS (&bufptr->oldest_unflush_lsa), LSA_AS_ARGS (&seq_flusher->flush_upto_lsa));
if (pgbuf_bcb_safe_flush_internal (thread_p, bufptr, true, &locked_bcb) == NO_ERROR)
{
/* now we should be ok. */
assert (LSA_ISNULL (&bufptr->oldest_unflush_lsa)
|| LSA_GT (&bufptr->oldest_unflush_lsa, &seq_flusher->flush_upto_lsa));
done_flush = true;
}
else
{
assert (false);
}
}
else
{
done_flush = true;
}
}
else
{
assert (false);
locked_bcb = false;
}
if (done_flush)
{
seq_flusher->flushed_pages++;
}
else
{
assert (false);
if (!locked_bcb)
{
PGBUF_BCB_LOCK (bufptr);
locked_bcb = true;
}
/* get the smallest oldest_unflush_lsa */
if (!LSA_ISNULL (&bufptr->oldest_unflush_lsa)
&& (LSA_ISNULL (chkpt_smallest_lsa) || LSA_LT (&bufptr->oldest_unflush_lsa, chkpt_smallest_lsa)))
{
LSA_COPY (chkpt_smallest_lsa, &bufptr->oldest_unflush_lsa);
}
}
if (locked_bcb)
{
PGBUF_BCB_UNLOCK (bufptr);
locked_bcb = false;
}
#if defined(SERVER_MODE)
if (limit_time != NULL && ignore_time_limit == false)
{
gettimeofday (&cur_time, NULL);
if (cur_time.tv_sec > limit_time->tv_sec
|| (cur_time.tv_sec == limit_time->tv_sec && cur_time.tv_usec >= limit_time->tv_usec))
{
*time_rem = -1;
break;
}
}
if (seq_flusher->burst_mode == false && seq_flusher->flush_rate > 0
&& seq_flusher->flushed_pages < flush_per_interval && ignore_time_limit == false)
{
if (limit_time != NULL)
{
time_rem_msec = (int) timeval_diff_in_msec (limit_time, &cur_time);
sleep_msecs = time_rem_msec / (flush_per_interval - seq_flusher->flushed_pages);
}
else
{
sleep_msecs = 1000.0f / (double) (seq_flusher->flush_rate);
}
if (sleep_msecs > (1000.0f / PGBUF_CHKPT_MAX_FLUSH_RATE))
{
thread_sleep (sleep_msecs);
}
}
if (thread_p && thread_p->shutdown == true)
{
return ER_FAILED;
}
#endif /* SERVER_MODE */
}
#if defined (SERVER_MODE)
gettimeofday (&cur_time, NULL);
if (limit_time != NULL)
{
time_rem_msec = (int) timeval_diff_in_msec (limit_time, &cur_time);
*time_rem = time_rem_msec;
seq_flusher->control_intervals_cnt++;
if (seq_flusher->control_intervals_cnt >= control_total_cnt_intervals || ignore_time_limit == true)
{
seq_flusher->control_intervals_cnt = 0;
}
if (seq_flusher->control_intervals_cnt == 0)
{
seq_flusher->control_flushed = 0;
}
else
{
seq_flusher->control_flushed += seq_flusher->flushed_pages;
}
}
#endif /* SERVER_MODE */
detailed_er_log ("pgbuf_flush_seq_list end (%s): %s %s pages : %d written/%d dropped, "
"Remaining_time:%d, Avail_time:%d, Curr:%d/%d,", "ckpt",
((time_rem_msec <= 0) ? "[Expired] " : ""), (ignore_time_limit ? "[boost]" : ""),
seq_flusher->flushed_pages, dropped_pages, time_rem_msec, avail_time_msec, seq_flusher->flush_idx,
seq_flusher->flush_cnt);
return error;
#undef detailed_er_log
}
/*
* pgbuf_copy_to_area () - Copy a portion of a page to the given area
* return: area or NULL
* vpid(in): Complete Page identifier
* start_offset(in): Start offset of interested content in page
* length(in): Length of the content of page to copy
* area(in): Area where to copy the needed content of the page
* do_fetch(in): Do we want to cache the page in the buffer pool when it is
* not already cached?
*
* Note: If the page is not in the page buffer pool, it is only buffered when
* the value of "do_fetch" is not false.
*
* WARNING:
* The user should be very careful when deciding whether or not to allow
* buffering of pages. If the page is going to be used in the near
* future, it is better to allow buffering the page to avoid extra I/O.
* It is better to avoid I/Os than to avoid memcpys.
*/
void *
pgbuf_copy_to_area (THREAD_ENTRY * thread_p, const VPID * vpid, int start_offset, int length, void *area, bool do_fetch)
{
PGBUF_BUFFER_HASH *hash_anchor;
PGBUF_BCB *bufptr;
PAGE_PTR pgptr;
if (logtb_get_check_interrupt (thread_p) == true)
{
if (logtb_is_interrupted (thread_p, true, &pgbuf_Pool.check_for_interrupts) == true)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_INTERRUPTED, 0);
return NULL;
}
}
#if defined(CUBRID_DEBUG)
if (start_offset < 0 || (start_offset + length) > DB_PAGESIZE)
{
er_log_debug (ARG_FILE_LINE,
"pgbuf_copy_to_area: SYSTEM ERROR.. Trying to copy"
" from beyond page boundary limits. Start_offset = %d, length = %d\n", start_offset, length);
er_set (ER_FATAL_ERROR_SEVERITY, ARG_FILE_LINE, ER_GENERIC_ERROR, 0);
return NULL;
}
#endif /* CUBRID_DEBUG */
/* Is this a resident page ? */
hash_anchor = &(pgbuf_Pool.buf_hash_table[PGBUF_HASH_VALUE (vpid)]);
bufptr = pgbuf_search_hash_chain (thread_p, hash_anchor, vpid);
if (bufptr == NULL)
{
/* the caller is holding only hash_anchor->hash_mutex. */
/* release hash mutex */
pthread_mutex_unlock (&hash_anchor->hash_mutex);
if (er_errid () == ER_CSS_PTHREAD_MUTEX_TRYLOCK)
{
return NULL;
}
/* The page is not on the buffer pool. Do we want to cache the page ? */
if (do_fetch == true)
{
pgptr = pgbuf_fix (thread_p, vpid, OLD_PAGE, PGBUF_LATCH_READ, PGBUF_UNCONDITIONAL_LATCH);
if (pgptr != NULL)
{
#if !defined (NDEBUG)
(void) pgbuf_check_page_ptype (thread_p, pgptr, PAGE_AREA);
#endif /* !NDEBUG */
memcpy (area, (char *) pgptr + start_offset, length);
pgbuf_unfix_and_init (thread_p, pgptr);
}
else
{
area = NULL;
}
}
#if defined(ENABLE_UNUSED_FUNCTION)
else
{
/*
* Do not cache the page in the page buffer pool.
* Read the needed portion of the page directly from disk
*/
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page (thread_p, vpid, false) != DISK_VALID)
{
return NULL;
}
}
/* Record number of reads in statistics */
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_IOREADS);
if (fileio_read_user_area (thread_p, fileio_get_volume_descriptor (vpid->volid), vpid->pageid, start_offset,
length, area) == NULL)
{
area = NULL;
}
}
#endif
}
else
{
/* the caller is holding only bufptr->mutex. */
CAST_BFPTR_TO_PGPTR (pgptr, bufptr);
#if !defined (NDEBUG)
(void) pgbuf_check_page_ptype (thread_p, pgptr, PAGE_AREA);
#endif /* !NDEBUG */
memcpy (area, (char *) pgptr + start_offset, length);
if (thread_get_sort_stats_active (thread_p))
{
perfmon_inc_stat (thread_p, PSTAT_SORT_NUM_DATA_PAGES);
}
/* release mutex */
PGBUF_BCB_UNLOCK (bufptr);
}
return area;
}
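/* Illustrative usage sketch (a sketch; vpid and the buffer size are assumptions): copy the first 128 bytes of a page
* without keeping it fixed afterwards:
*
*   char buf[128];
*   if (pgbuf_copy_to_area (thread_p, &vpid, 0, (int) sizeof (buf), buf, true) == NULL)
*     {
*       // interrupted, or the page could not be fixed
*     }
*/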
/*
* pgbuf_copy_from_area () - Copy area to a portion of given page
* return: area or NULL
* vpid(in): Complete Page identifier
* start_offset(in): Start offset of interested content in page
* length(in): Length of the content of page to copy
* area(in): Area where to copy the needed content of the page
* do_fetch(in): Do we want to cache the page in the buffer pool when it is
* not already cached?
*
* Note: Copy the content of the given area to the page starting at the given
* offset. If the page is not in the page buffer pool, it is only
* buffered when the value of "do_fetch" is not false.
*
* WARNING:
* The user should be very careful when deciding whether or not to allow
* buffering of pages. If the page is going to be used in the near
* future, it is better to allow buffering the page to avoid extra I/O.
* If you do not buffer the page, no header recovery information is
* copied along with the write of the page. In this case, the page may
* not be recoverable.
* DO NOT USE THIS FEATURE IF YOU LOGGED ANYTHING RELATED TO THIS PAGE.
*/
void *
pgbuf_copy_from_area (THREAD_ENTRY * thread_p, const VPID * vpid, int start_offset, int length, void *area,
bool do_fetch, TDE_ALGORITHM tde_algo)
{
PGBUF_BUFFER_HASH *hash_anchor;
PGBUF_BCB *bufptr;
PAGE_PTR pgptr;
LOG_DATA_ADDR addr;
assert (start_offset >= 0 && (start_offset + length) <= DB_PAGESIZE);
#if defined(ENABLE_UNUSED_FUNCTION)
int vol_fd;
/* Is this a resident page ? */
hash_anchor = &(pgbuf_Pool.buf_hash_table[PGBUF_HASH_VALUE (vpid)]);
bufptr = pgbuf_search_hash_chain (thread_p, hash_anchor, vpid);
if (bufptr == NULL)
{
/* the caller is holding only hash_anchor->hash_mutex. */
pthread_mutex_unlock (&hash_anchor->hash_mutex);
if (er_errid () == ER_CSS_PTHREAD_MUTEX_TRYLOCK)
{
return NULL;
}
if (do_fetch == false)
{
/* Do not cache the page in the page buffer pool. Write the desired portion of the page directly to disk */
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page (thread_p, vpid, false) != DISK_VALID)
{
return NULL;
}
}
/* Record number of reads in statistics */
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_IOWRITES);
vol_fd = fileio_get_volume_descriptor (vpid->volid);
if (fileio_write_user_area (thread_p, vol_fd, vpid->pageid, start_offset, length, area) == NULL)
{
area = NULL;
}
return area;
}
}
else
{
/* the caller is holding only bufptr->mutex. */
PGBUF_BCB_UNLOCK (bufptr);
}
#endif
pgptr = pgbuf_fix (thread_p, vpid, NEW_PAGE, PGBUF_LATCH_WRITE, PGBUF_UNCONDITIONAL_LATCH);
if (pgptr != NULL)
{
(void) pgbuf_set_page_ptype (thread_p, pgptr, PAGE_AREA);
pgbuf_set_tde_algorithm (thread_p, pgptr, tde_algo, true);
memcpy ((char *) pgptr + start_offset, area, length);
/* Inform log manager that there is no need to log this page */
addr.vfid = NULL;
addr.pgptr = pgptr;
addr.offset = 0;
log_skip_logging (thread_p, &addr);
pgbuf_set_dirty (thread_p, pgptr, FREE);
}
else
{
area = NULL;
}
return area;
}
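/* Illustrative usage sketch (a sketch; vpid, area and length are assumptions). The function formats the page as
* PAGE_AREA, skips logging, and sets the page dirty and unfixes it internally:
*
*   if (pgbuf_copy_from_area (thread_p, &vpid, 0, length, area, true, TDE_ALGORITHM_NONE) == NULL)
*     {
*       // the page could not be fixed; nothing was written
*     }
*/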
/*
* pgbuf_set_dirty () - Mark as modified the buffer associated with pgptr and optionally free the page
* return: void
* pgptr(in): Pointer to page
* free_page(in): Free the page too ?
*/
#if !defined(NDEBUG)
void
pgbuf_set_dirty_debug (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, bool free_page, const char *caller_file,
int caller_line, const char *caller_func)
#else
void
pgbuf_set_dirty (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, bool free_page)
#endif
{
PGBUF_BCB *bufptr;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return;
}
}
/* Get the address of the buffer from the page and set buffer dirty */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (!VPID_ISNULL (&bufptr->vpid));
#if defined(SERVER_MODE) && !defined(NDEBUG)
if (bufptr->vpid.pageid == 0)
{
disk_volheader_check_magic (thread_p, pgptr);
}
#endif
pgbuf_set_dirty_buffer_ptr (thread_p, bufptr);
/* If free request is given, unfix the page. */
if (free_page == FREE)
{
pgbuf_unfix (thread_p, pgptr);
}
}
/*
* pgbuf_get_lsa () - Find the log sequence address of the given page
* return: page lsa
* pgptr(in): Pointer to page
*/
LOG_LSA *
pgbuf_get_lsa (PAGE_PTR pgptr)
{
FILEIO_PAGE *io_pgptr;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return NULL;
}
}
/* NOTE: Does not need to hold mutex since the page is fixed */
CAST_PGPTR_TO_IOPGPTR (io_pgptr, pgptr);
return &io_pgptr->prv.lsa;
}
/*
* pgbuf_set_lsa () - Set the log sequence address of the page to the given lsa
* return: page lsa or NULL
* pgptr(in): Pointer to page
* lsa_ptr(in): Log Sequence address
*
* Note: This function is for the exclusive use of the log and recovery manager.
*/
#if !defined(NDEBUG)
const LOG_LSA *
pgbuf_set_lsa_debug (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, const LOG_LSA * lsa_ptr, const char *caller_file,
int caller_line, const char *caller_func)
#else
const LOG_LSA *
pgbuf_set_lsa (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, const LOG_LSA * lsa_ptr)
#endif
{
PGBUF_BCB *bufptr;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return NULL;
}
}
assert (lsa_ptr != NULL);
/* NOTE: Does not need to hold mutex since the page is fixed */
/* Get the address of the buffer from the page and set buffer dirty */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
/*
* Don't change LSA of temporary volumes or auxiliary volumes.
* (e.g., those of copydb, backupdb).
*/
if (pgbuf_is_temp_lsa (bufptr->iopage_buffer->iopage.prv.lsa)
|| PGBUF_IS_AUXILIARY_VOLUME (bufptr->vpid.volid) == true)
{
return NULL;
}
/*
* Always set the lsa of temporary volumes to the special
* temp lsa, if it was somehow changed.
*/
if (pgbuf_is_temporary_volume (bufptr->vpid.volid) == true)
{
pgbuf_init_temp_page_lsa (&bufptr->iopage_buffer->iopage, IO_PAGESIZE);
if (logtb_is_current_active (thread_p))
{
return NULL;
}
}
fileio_set_page_lsa (&bufptr->iopage_buffer->iopage, lsa_ptr, IO_PAGESIZE);
/*
* If this is the first time the page is set dirty, record the new LSA
* of the page as the oldest_unflush_lsa for the page.
* We could have done this where the page is set dirty;
* unfortunately, some pages are set dirty before an LSA is set.
*/
if (LSA_ISNULL (&bufptr->oldest_unflush_lsa))
{
if (LSA_LT (lsa_ptr, &log_Gl.chkpt_redo_lsa))
{
LOG_LSA chkpt_redo_lsa;
int rc;
rc = pthread_mutex_lock (&log_Gl.chkpt_lsa_lock);
LSA_COPY (&chkpt_redo_lsa, &log_Gl.chkpt_redo_lsa);
pthread_mutex_unlock (&log_Gl.chkpt_lsa_lock);
if (LSA_LT (lsa_ptr, &chkpt_redo_lsa))
{
er_stack_push ();
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_LOG_CHECKPOINT_SKIP_INVALID_PAGE, 6, bufptr->vpid.pageid,
fileio_get_volume_label (bufptr->vpid.volid, PEEK), lsa_ptr->pageid, lsa_ptr->offset,
log_Gl.chkpt_redo_lsa.pageid, log_Gl.chkpt_redo_lsa.offset);
er_stack_pop ();
assert (false);
}
}
LSA_COPY (&bufptr->oldest_unflush_lsa, lsa_ptr);
}
#if defined (NDEBUG)
/* We expect the page was or will be set as dirty before unfix. However, there might be a missing case to set dirty.
* It is correct to set dirty here. Note that we have set the lsa of the page, so it should also be flushed.
* But we also want to find missing cases and fix them. Make everything sure for release builds.
*/
pgbuf_set_dirty_buffer_ptr (thread_p, bufptr);
#endif /* NDEBUG */
return lsa_ptr;
}
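/* Illustrative WAL-style sequence on a fixed page (a sketch; rec_lsa is an assumption): the change is logged first,
* then the log record's LSA is stamped on the page, so the page can never reach disk ahead of its log record:
*
*   LOG_LSA rec_lsa;             // LSA of the log record describing the change
*   // ... append the log record and obtain rec_lsa ...
*   pgbuf_set_lsa (thread_p, pgptr, &rec_lsa);
*   pgbuf_set_dirty (thread_p, pgptr, DONT_FREE);
*/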
/*
* pgbuf_reset_temp_lsa () - Reset LSA of temp volume to special temp LSA (-2,-2)
* return: void
* pgptr(in): Pointer to page
*/
void
pgbuf_reset_temp_lsa (PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
pgbuf_init_temp_page_lsa (&bufptr->iopage_buffer->iopage, IO_PAGESIZE);
}
/*
* pgbuf_set_tde_algorithm () - set tde encryption algorithm to the page
* return: void
* thread_p (in) : Thread entry
* pgptr(in): Page pointer
* tde_algo (in) : encryption algorithm - NONE, AES, ARIA
*/
void
pgbuf_set_tde_algorithm (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, TDE_ALGORITHM tde_algo, bool skip_logging)
{
FILEIO_PAGE *iopage = NULL;
TDE_ALGORITHM prev_tde_algo = TDE_ALGORITHM_NONE;
assert (tde_is_loaded () || tde_algo == TDE_ALGORITHM_NONE);
prev_tde_algo = pgbuf_get_tde_algorithm (pgptr);
if (prev_tde_algo == tde_algo)
{
return;
}
CAST_PGPTR_TO_IOPGPTR (iopage, pgptr);
tde_er_log ("pgbuf_set_tde_algorithm(): VPID = %d|%d, tde_algorithm = %s\n", iopage->prv.volid,
iopage->prv.pageid, tde_get_algorithm_name (tde_algo));
if (!skip_logging)
{
log_append_undoredo_data2 (thread_p, RVPGBUF_SET_TDE_ALGORITHM, NULL, pgptr, 0, sizeof (TDE_ALGORITHM),
sizeof (TDE_ALGORITHM), &prev_tde_algo, &tde_algo);
}
/* clear tde encryption bits */
iopage->prv.pflag &= ~FILEIO_PAGE_FLAG_ENCRYPTED_MASK;
switch (tde_algo)
{
case TDE_ALGORITHM_AES:
iopage->prv.pflag |= FILEIO_PAGE_FLAG_ENCRYPTED_AES;
break;
case TDE_ALGORITHM_ARIA:
iopage->prv.pflag |= FILEIO_PAGE_FLAG_ENCRYPTED_ARIA;
break;
case TDE_ALGORITHM_NONE:
break; // do nothing, already cleared
default:
assert (false);
}
pgbuf_set_dirty (thread_p, pgptr, DONT_FREE);
}
/*
* pgbuf_rv_set_tde_algorithm () - recovery setting tde encryption algorithm to the page
* return : NO_ERROR, or ER_code
* thread_p (in) : Thread entry
* rcv (in) : Recovery structure; rcv->pgptr is the page and rcv->data holds the TDE_ALGORITHM
*/
int
pgbuf_rv_set_tde_algorithm (THREAD_ENTRY * thread_p, LOG_RCV * rcv)
{
FILEIO_PAGE *iopage = NULL;
PAGE_PTR pgptr = rcv->pgptr;
TDE_ALGORITHM tde_algo = *((TDE_ALGORITHM *) rcv->data);
assert (rcv->length == sizeof (TDE_ALGORITHM));
pgbuf_set_tde_algorithm (thread_p, pgptr, tde_algo, true);
return NO_ERROR;
}
/*
* pgbuf_get_tde_algorithm () - get tde encryption algorithm of the page
* return: TDE_ALGORITHM - NONE, AES, or ARIA
* pgptr(in): Page pointer
*/
TDE_ALGORITHM
pgbuf_get_tde_algorithm (PAGE_PTR pgptr)
{
FILEIO_PAGE *iopage = NULL;
CAST_PGPTR_TO_IOPGPTR (iopage, pgptr);
// encryption algorithms are exclusive
assert (!((iopage->prv.pflag & FILEIO_PAGE_FLAG_ENCRYPTED_AES) &&
(iopage->prv.pflag & FILEIO_PAGE_FLAG_ENCRYPTED_ARIA)));
if (iopage->prv.pflag & FILEIO_PAGE_FLAG_ENCRYPTED_AES)
{
return TDE_ALGORITHM_AES;
}
else if (iopage->prv.pflag & FILEIO_PAGE_FLAG_ENCRYPTED_ARIA)
{
return TDE_ALGORITHM_ARIA;
}
else
{
return TDE_ALGORITHM_NONE;
}
}
/*
* pgbuf_get_vpid () - Find the volume and page identifier associated with the passed buffer
* return: void
* pgptr(in): Page pointer
* vpid(out): Volume and page identifier
*/
void
pgbuf_get_vpid (PAGE_PTR pgptr, VPID * vpid)
{
PGBUF_BCB *bufptr;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
VPID_SET_NULL (vpid);
return;
}
}
/* NOTE: Does not need to hold mutex since the page is fixed */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
*vpid = bufptr->vpid;
}
/*
* pgbuf_get_vpid_ptr () - Find the volume and page identifier associated with the passed buffer
* return: pointer to vpid
* pgptr(in): Page pointer
*
* Note: Once the buffer is freed, the content of the vpid pointer may be
* updated by the page buffer manager, thus a lot of care should be taken.
* The values of the vpid pointer must not be altered by the caller.
* Once the page is freed, the vpid pointer should not be used any longer.
*/
VPID *
pgbuf_get_vpid_ptr (PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return NULL;
}
}
/* NOTE: Does not need to hold mutex since the page is fixed */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
return &(bufptr->vpid);
}
/*
* pgbuf_get_latch_mode () - Find the latch mode associated with the passed buffer
* return: latch mode
* pgptr(in): Page pointer
*/
PGBUF_LATCH_MODE
pgbuf_get_latch_mode (PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return PGBUF_LATCH_INVALID;
}
}
/* NOTE: Does not need to hold mutex since the page is fixed */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
return get_latch (&bufptr->atomic_latch);
}
/*
* pgbuf_get_page_id () - Find the page identifier associated with the passed buffer
* return: PAGEID
* pgptr(in): Page pointer
*/
PAGEID
pgbuf_get_page_id (PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
/* NOTE: Does not need to hold mutex since the page is fixed */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (pgbuf_check_bcb_page_vpid (bufptr, false) == true);
return bufptr->vpid.pageid;
}
/*
* pgbuf_get_page_ptype () -
* return:
* pgptr(in): Pointer to page
*/
PAGE_TYPE
pgbuf_get_page_ptype (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
PAGE_TYPE ptype;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return PAGE_UNKNOWN; /* TODO - need to return error_code */
}
}
/* NOTE: Does not need to hold mutex since the page is fixed */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert_release (pgbuf_check_bcb_page_vpid (bufptr, false) == true);
ptype = (PAGE_TYPE) (bufptr->iopage_buffer->iopage.prv.ptype);
assert (PAGE_UNKNOWN <= (int) ptype);
assert (ptype <= PAGE_LAST);
return ptype;
}
/*
* pgbuf_get_volume_id () - Find the volume associated with the passed buffer
* return: VOLID
* pgptr(in): Page pointer
*/
VOLID
pgbuf_get_volume_id (PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return NULL_VOLID;
}
}
/* NOTE: Does not need to hold mutex since the page is fixed */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
return bufptr->vpid.volid;
}
/*
* pgbuf_get_volume_label () - Find the name of the volume associated with the passed buffer
* return: Volume label
* pgptr(in): Page pointer
*/
const char *
pgbuf_get_volume_label (PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
/* NOTE: Does not need to hold mutex since the page is fixed */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (!VPID_ISNULL (&bufptr->vpid));
return fileio_get_volume_label (bufptr->vpid.volid, PEEK);
}
/*
* pgbuf_force_to_check_for_interrupts () - Force the page buffer manager
* to check for possible interrupts when pages are fetched
* return: void
* void(in):
*/
void
pgbuf_force_to_check_for_interrupts (void)
{
pgbuf_Pool.check_for_interrupts = true;
}
/*
* pgbuf_is_log_check_for_interrupts () - Check for a pending interrupt, if the page
* buffer manager was forced to check for interrupts when pages are fetched
* return: true if there is an interrupt, otherwise false
* thread_p(in): thread entry
*/
bool
pgbuf_is_log_check_for_interrupts (THREAD_ENTRY * thread_p)
{
if (pgbuf_Pool.check_for_interrupts == true
&& logtb_is_interrupted (thread_p, true, &pgbuf_Pool.check_for_interrupts) == true)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_INTERRUPTED, 0);
return true;
}
else
{
return false;
}
}
/*
* pgbuf_set_lsa_as_temporary () - The log sequence address of the page is set to temporary lsa address
* return: void
* pgptr(in): Pointer to page
*
* Note: Set the log sequence address of the page to the non-recoverable LSA
* address. In this case the page is declared a non-recoverable page
* (temporary page). Logging must not be done in a temporary page,
* however it is not enforced. A warning message is issued if someone
* logs something. This warning will indicate a potential bug.
*
* This function is used for debugging.
*/
void
pgbuf_set_lsa_as_temporary (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (!VPID_ISNULL (&bufptr->vpid));
pgbuf_init_temp_page_lsa (&bufptr->iopage_buffer->iopage, IO_PAGESIZE);
pgbuf_set_dirty_buffer_ptr (thread_p, bufptr);
}
/*
* pgbuf_set_bcb_page_vpid () -
* return: void
* bufptr(in): pointer to buffer page
*
*/
STATIC_INLINE void
pgbuf_set_bcb_page_vpid (PGBUF_BCB * bufptr)
{
if (bufptr == NULL || VPID_ISNULL (&bufptr->vpid))
{
assert (bufptr != NULL);
assert (!VPID_ISNULL (&bufptr->vpid));
return;
}
/* perm volume */
if (bufptr->vpid.volid > NULL_VOLID)
{
/* Check if is the first time */
if (bufptr->iopage_buffer->iopage.prv.pageid == NULL_PAGEID
&& bufptr->iopage_buffer->iopage.prv.volid == NULL_VOLID)
{
/* Set Page identifier */
bufptr->iopage_buffer->iopage.prv.pageid = bufptr->vpid.pageid;
bufptr->iopage_buffer->iopage.prv.volid = bufptr->vpid.volid;
bufptr->iopage_buffer->iopage.prv.ptype = PAGE_UNKNOWN;
bufptr->iopage_buffer->iopage.prv.p_reserve_1 = 0;
bufptr->iopage_buffer->iopage.prv.p_reserve_2 = 0;
bufptr->iopage_buffer->iopage.prv.tde_nonce = 0;
}
else
{
/* values not reset upon page deallocation */
assert (bufptr->iopage_buffer->iopage.prv.volid == bufptr->vpid.volid);
assert (bufptr->iopage_buffer->iopage.prv.pageid == bufptr->vpid.pageid);
}
}
}
/*
* pgbuf_set_page_ptype () -
* return: void
* pgptr(in): Pointer to page
* ptype(in): page type
*
* Note: This function is used for debugging.
*/
void
pgbuf_set_page_ptype (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, PAGE_TYPE ptype)
{
PGBUF_BCB *bufptr;
assert (pgptr != NULL);
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
assert (false);
return;
}
}
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (!VPID_ISNULL (&bufptr->vpid));
/* Set Page identifier if needed */
pgbuf_set_bcb_page_vpid (bufptr);
if (pgbuf_check_bcb_page_vpid (bufptr, false) != true)
{
assert (false);
return;
}
bufptr->iopage_buffer->iopage.prv.ptype = (unsigned char) ptype;
assert_release (bufptr->iopage_buffer->iopage.prv.ptype == ptype);
}
/*
* pgbuf_is_lsa_temporary () - Find if the page is a temporary one
* return: true/false
* pgptr(in): Pointer to page
*/
bool
pgbuf_is_lsa_temporary (PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
if (pgbuf_is_temp_lsa (bufptr->iopage_buffer->iopage.prv.lsa)
|| pgbuf_is_temporary_volume (bufptr->vpid.volid) == true)
{
return true;
}
else
{
return false;
}
}
/*
* pgbuf_is_temporary_volume () - Find if the given permanent volume has been declared for temporary storage purposes
* return: true/false
* volid(in): Volume identifier of last allocated permanent volume
*/
STATIC_INLINE bool
pgbuf_is_temporary_volume (VOLID volid)
{
/* TODO: I don't know why page buffer should care about temporary files and what this does, but it is really annoying.
* until database is loaded and restarted, I will return false always. */
if (!LOG_ISRESTARTED ())
{
return false;
}
return xdisk_get_purpose (NULL, volid) == DB_TEMPORARY_DATA_PURPOSE;
}
/*
* pgbuf_initialize_bcb_table () - Initializes page buffer BCB table
* return: NO_ERROR, or ER_code
*/
static int
pgbuf_initialize_bcb_table (void)
{
PGBUF_BCB *bufptr;
PGBUF_IOPAGE_BUFFER *ioptr;
PGBUF_ATOMIC_LATCH_IMPL impl;
int i;
long long unsigned alloc_size;
impl.impl.latch_mode = PGBUF_LATCH_INVALID;
impl.impl.waiter_exists = false;
impl.impl.fcnt = 0;
/* allocate space for page buffer BCB table */
alloc_size = (long long unsigned) pgbuf_Pool.num_buffers * PGBUF_BCB_SIZEOF;
if (!MEM_SIZE_IS_VALID (alloc_size))
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_PRM_BAD_VALUE, 1, "data_buffer_pages");
return ER_PRM_BAD_VALUE;
}
pgbuf_Pool.BCB_table = (PGBUF_BCB *) malloc ((size_t) alloc_size);
if (pgbuf_Pool.BCB_table == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) alloc_size);
return ER_OUT_OF_VIRTUAL_MEMORY;
}
/* allocate space for io page buffers */
alloc_size = (long long unsigned) pgbuf_Pool.num_buffers * PGBUF_IOPAGE_BUFFER_SIZE;
if (!MEM_SIZE_IS_VALID (alloc_size))
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_PRM_BAD_VALUE, 1, "data_buffer_pages");
if (pgbuf_Pool.BCB_table != NULL)
{
free_and_init (pgbuf_Pool.BCB_table);
}
return ER_PRM_BAD_VALUE;
}
pgbuf_Pool.iopage_table = (PGBUF_IOPAGE_BUFFER *) malloc ((size_t) alloc_size);
if (pgbuf_Pool.iopage_table == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) alloc_size);
if (pgbuf_Pool.BCB_table != NULL)
{
free_and_init (pgbuf_Pool.BCB_table);
}
return ER_OUT_OF_VIRTUAL_MEMORY;
}
/* initialize each entry of the buffer BCB table */
for (i = 0; i < pgbuf_Pool.num_buffers; i++)
{
bufptr = PGBUF_FIND_BCB_PTR (i);
pthread_mutex_init (&bufptr->mutex, NULL);
#if defined (SERVER_MODE)
bufptr->owner_mutex = -1;
#endif /* SERVER_MODE */
VPID_SET_NULL (&bufptr->vpid);
placement_new (&bufptr->atomic_latch, 0);
bufptr->atomic_latch.store (impl.raw);
#if defined(SERVER_MODE)
bufptr->next_wait_thrd = NULL;
#endif /* SERVER_MODE */
#if defined(SERVER_MODE)
bufptr->latch_last_thread = NULL;
#endif /* SERVER_MODE */
bufptr->hash_next = NULL;
bufptr->prev_BCB = NULL;
if (i == (pgbuf_Pool.num_buffers - 1))
{
bufptr->next_BCB = NULL;
}
else
{
bufptr->next_BCB = PGBUF_FIND_BCB_PTR (i + 1);
}
bufptr->flags = PGBUF_BCB_INIT_FLAGS;
bufptr->count_fix_and_avoid_dealloc = 0;
bufptr->hit_age = 0;
LSA_SET_NULL (&bufptr->oldest_unflush_lsa);
bufptr->tick_lru3 = 0;
bufptr->tick_lru_list = 0;
/* link BCB and iopage buffer */
ioptr = PGBUF_FIND_IOPAGE_PTR (i);
fileio_init_lsa_of_page (&ioptr->iopage, IO_PAGESIZE);
/* Init Page identifier */
ioptr->iopage.prv.pageid = -1;
ioptr->iopage.prv.volid = -1;
ioptr->iopage.prv.ptype = (unsigned char) PAGE_UNKNOWN;
ioptr->iopage.prv.pflag = '\0';
ioptr->iopage.prv.p_reserve_1 = 0;
ioptr->iopage.prv.p_reserve_2 = 0;
ioptr->iopage.prv.tde_nonce = 0;
bufptr->iopage_buffer = ioptr;
ioptr->bcb = bufptr;
#if defined(CUBRID_DEBUG)
/* Reinitialize the buffer */
pgbuf_scramble (&bufptr->iopage_buffer->iopage);
memcpy (PGBUF_FIND_BUFFER_GUARD (bufptr), pgbuf_Guard, sizeof (pgbuf_Guard));
#endif /* CUBRID_DEBUG */
}
return NO_ERROR;
}
/*
* pgbuf_initialize_hash_table () - Initializes page buffer hash table
* return: NO_ERROR, or ER_code
*/
static int
pgbuf_initialize_hash_table (void)
{
size_t hashsize, i;
/* allocate space for the buffer hash table */
hashsize = PGBUF_HASH_SIZE;
pgbuf_Pool.buf_hash_table = (PGBUF_BUFFER_HASH *) malloc (hashsize * PGBUF_BUFFER_HASH_SIZEOF);
if (pgbuf_Pool.buf_hash_table == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (hashsize * PGBUF_BUFFER_HASH_SIZEOF));
return ER_OUT_OF_VIRTUAL_MEMORY;
}
/* initialize each entry of the buffer hash table */
for (i = 0; i < hashsize; i++)
{
pthread_mutex_init (&pgbuf_Pool.buf_hash_table[i].hash_mutex, NULL);
pgbuf_Pool.buf_hash_table[i].hash_next = NULL;
pgbuf_Pool.buf_hash_table[i].lock_next = NULL;
}
return NO_ERROR;
}
/*
* pgbuf_initialize_lock_table () - Initializes page buffer lock table
* return: NO_ERROR, or ER_code
*/
static int
pgbuf_initialize_lock_table (void)
{
size_t i;
size_t thrd_num_total;
size_t alloc_size;
/* allocate memory space for the buffer lock table */
thrd_num_total = thread_num_total_threads ();
#if !defined(SERVER_MODE)
assert (thrd_num_total == 1);
#endif /* !SERVER_MODE */
alloc_size = thrd_num_total * PGBUF_BUFFER_LOCK_SIZEOF;
pgbuf_Pool.buf_lock_table = (PGBUF_BUFFER_LOCK *) malloc (alloc_size);
if (pgbuf_Pool.buf_lock_table == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, alloc_size);
return ER_OUT_OF_VIRTUAL_MEMORY;
}
/* initialize each entry of the buffer lock table */
for (i = 0; i < thrd_num_total; i++)
{
VPID_SET_NULL (&pgbuf_Pool.buf_lock_table[i].vpid);
pgbuf_Pool.buf_lock_table[i].lock_next = NULL;
#if defined(SERVER_MODE)
pgbuf_Pool.buf_lock_table[i].next_wait_thrd = NULL;
#endif /* SERVER_MODE */
}
return NO_ERROR;
}
/*
* pgbuf_initialize_lru_list () - Initializes the page buffer LRU list
* return: NO_ERROR, or ER_code
*/
static int
pgbuf_initialize_lru_list (void)
{
int i;
/* set the number of LRU lists */
pgbuf_Pool.num_LRU_list = prm_get_integer_value (PRM_ID_PB_NUM_LRU_CHAINS);
if (pgbuf_Pool.num_LRU_list == 0)
{
/* Default value of shared lists : # of transactions */
pgbuf_Pool.num_LRU_list = (int) MAX_NTRANS;
assert (pgbuf_Pool.num_LRU_list > 0);
if (pgbuf_Pool.num_buffers / pgbuf_Pool.num_LRU_list < PGBUF_MIN_PAGES_IN_SHARED_LIST)
{
pgbuf_Pool.num_LRU_list = pgbuf_Pool.num_buffers / PGBUF_MIN_PAGES_IN_SHARED_LIST;
}
/* should have at least 4 shared LRUs */
pgbuf_Pool.num_LRU_list = MAX (pgbuf_Pool.num_LRU_list, 4);
}
/* allocate memory space for the page buffer LRU lists */
pgbuf_Pool.buf_LRU_list = (PGBUF_LRU_LIST *) malloc (PGBUF_TOTAL_LRU_COUNT * PGBUF_LRU_LIST_SIZEOF);
if (pgbuf_Pool.buf_LRU_list == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1,
(PGBUF_TOTAL_LRU_COUNT * PGBUF_LRU_LIST_SIZEOF));
return ER_OUT_OF_VIRTUAL_MEMORY;
}
/* initialize the page buffer LRU lists */
for (i = 0; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
pgbuf_Pool.buf_LRU_list[i].index = i;
pthread_mutex_init (&pgbuf_Pool.buf_LRU_list[i].mutex, NULL);
pgbuf_Pool.buf_LRU_list[i].top = NULL;
pgbuf_Pool.buf_LRU_list[i].bottom = NULL;
pgbuf_Pool.buf_LRU_list[i].bottom_1 = NULL;
pgbuf_Pool.buf_LRU_list[i].bottom_2 = NULL;
pgbuf_Pool.buf_LRU_list[i].count_lru1 = 0;
pgbuf_Pool.buf_LRU_list[i].count_lru2 = 0;
pgbuf_Pool.buf_LRU_list[i].count_lru3 = 0;
pgbuf_Pool.buf_LRU_list[i].count_vict_cand = 0;
pgbuf_Pool.buf_LRU_list[i].victim_hint = NULL;
pgbuf_Pool.buf_LRU_list[i].tick_list = 0;
pgbuf_Pool.buf_LRU_list[i].tick_lru3 = 0;
pgbuf_Pool.buf_LRU_list[i].threshold_lru1 = 0;
pgbuf_Pool.buf_LRU_list[i].threshold_lru2 = 0;
pgbuf_Pool.buf_LRU_list[i].quota = 0;
pgbuf_Pool.buf_LRU_list[i].flags = 0;
}
return NO_ERROR;
}
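/*
 * Editor's sketch (illustrative only, not part of the original source):
 * the default shared-LRU count above is MAX_NTRANS, lowered so that each
 * list keeps at least PGBUF_MIN_PAGES_IN_SHARED_LIST buffers, and never
 * below 4 lists. For example, 1000 buffers, 100 transactions and a minimum
 * of 50 pages per list give 1000 / 100 = 10 < 50, hence 1000 / 50 = 20 lists.
 */
#if 0 /* editor's illustrative sketch */
static int
sketch_default_lru_count (int num_buffers, int num_trans, int min_pages_per_list)
{
int n = num_trans;
if (num_buffers / n < min_pages_per_list)
{
n = num_buffers / min_pages_per_list;
}
return MAX (n, 4); /* at least 4 shared LRUs, as above */
}
#endif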
/*
* pgbuf_initialize_aout_list () - initialize the Aout list
* return : error code or NO_ERROR
*/
static int
pgbuf_initialize_aout_list (void)
{
/* limit Aout size to equivalent of 512M */
#define PGBUF_LIMIT_AOUT_BUFFERS 32768
int i;
float aout_ratio;
size_t alloc_size = 0;
PGBUF_AOUT_LIST *list = &pgbuf_Pool.buf_AOUT_list;
aout_ratio = prm_get_float_value (PRM_ID_PB_AOUT_RATIO);
list->max_count = (int) (pgbuf_Pool.num_buffers * aout_ratio);
list->Aout_top = NULL;
list->Aout_bottom = NULL;
list->bufarray = NULL;
list->aout_buf_ht = NULL;
pthread_mutex_init (&list->Aout_mutex, NULL);
if (aout_ratio <= 0)
{
/* not using Aout list */
list->max_count = 0;
return NO_ERROR;
}
list->max_count = MIN (list->max_count, PGBUF_LIMIT_AOUT_BUFFERS);
alloc_size = list->max_count * sizeof (PGBUF_AOUT_BUF);
list->bufarray = (PGBUF_AOUT_BUF *) malloc (alloc_size);
if (list->bufarray == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, alloc_size);
return ER_OUT_OF_VIRTUAL_MEMORY;
}
list->Aout_free = &list->bufarray[0];
for (i = 0; i < list->max_count; i++)
{
VPID_SET_NULL (&list->bufarray[i].vpid);
list->bufarray[i].lru_idx = PGBUF_AOUT_NOT_FOUND;
if (i != list->max_count - 1)
{
list->bufarray[i].next = &list->bufarray[i + 1];
}
else
{
list->bufarray[i].next = NULL;
}
list->bufarray[i].prev = NULL;
}
list->num_hashes = MAX (list->max_count / AOUT_HASH_DIVIDE_RATIO, 1);
alloc_size = list->num_hashes * sizeof (MHT_TABLE *);
list->aout_buf_ht = (MHT_TABLE **) malloc (alloc_size);
if (list->aout_buf_ht == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, alloc_size);
goto error_return;
}
memset (list->aout_buf_ht, 0, alloc_size);
for (i = 0; i < list->num_hashes; i++)
{
list->aout_buf_ht[i] = mht_create ("PGBUF_AOUT_HASH", list->max_count, pgbuf_hash_vpid, pgbuf_compare_vpid);
if (list->aout_buf_ht[i] == NULL)
{
goto error_return;
}
}
return NO_ERROR;
error_return:
list->Aout_free = NULL;
if (list->bufarray != NULL)
{
free_and_init (list->bufarray);
}
if (list->aout_buf_ht != NULL)
{
for (i = 0; list->aout_buf_ht[i] != NULL; i++)
{
mht_destroy (list->aout_buf_ht[i]);
}
free_and_init (list->aout_buf_ht);
}
pthread_mutex_destroy (&list->Aout_mutex);
return ER_FAILED;
#undef PGBUF_LIMIT_AOUT_BUFFERS
}
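/*
 * Editor's sketch (illustrative only, not part of the original source):
 * the init loop above threads bufarray into a singly-linked free list:
 * entry i points to entry i + 1 and the last entry terminates the list.
 * The helper below is a generic restatement of that idiom.
 */
#if 0 /* editor's illustrative sketch */
static PGBUF_AOUT_BUF *
sketch_thread_free_list (PGBUF_AOUT_BUF * arr, int count)
{
for (int i = 0; i < count; i++)
{
VPID_SET_NULL (&arr[i].vpid);
arr[i].next = (i + 1 < count) ? &arr[i + 1] : NULL;
arr[i].prev = NULL;
}
return (count > 0) ? &arr[0] : NULL; /* head of the free list */
}
#endif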
/*
* pgbuf_initialize_invalid_list () - Initializes the page buffer invalid list
* return: NO_ERROR
*/
static int
pgbuf_initialize_invalid_list (void)
{
/* initialize the invalid BCB list */
pthread_mutex_init (&pgbuf_Pool.buf_invalid_list.invalid_mutex, NULL);
pgbuf_Pool.buf_invalid_list.invalid_top = PGBUF_FIND_BCB_PTR (0);
pgbuf_Pool.buf_invalid_list.invalid_cnt = pgbuf_Pool.num_buffers;
return NO_ERROR;
}
/*
* pgbuf_initialize_thrd_holder () -
* return: NO_ERROR, or ER_code
*/
static int
pgbuf_initialize_thrd_holder (void)
{
size_t thrd_num_total;
size_t alloc_size;
size_t i, j, idx;
thrd_num_total = thread_num_total_threads ();
#if !defined(SERVER_MODE)
assert (thrd_num_total == 1);
#endif /* !SERVER_MODE */
pgbuf_Pool.thrd_holder_info = (PGBUF_HOLDER_ANCHOR *) malloc (thrd_num_total * PGBUF_HOLDER_ANCHOR_SIZEOF);
if (pgbuf_Pool.thrd_holder_info == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1,
thrd_num_total * PGBUF_HOLDER_ANCHOR_SIZEOF);
return ER_OUT_OF_VIRTUAL_MEMORY;
}
/* phase 1: allocate memory space that is used for BCB holder entries */
alloc_size = thrd_num_total * PGBUF_DEFAULT_FIX_COUNT * PGBUF_HOLDER_SIZEOF;
pgbuf_Pool.thrd_reserved_holder = (PGBUF_HOLDER *) malloc (alloc_size);
if (pgbuf_Pool.thrd_reserved_holder == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, alloc_size);
return ER_OUT_OF_VIRTUAL_MEMORY;
}
/* phase 2: initialize all the BCB holder entries */
/*
* Each thread has both a free holder list and a used (held) holder list.
* The free holder list of each thread is initialized to
* have PGBUF_DEFAULT_FIX_COUNT entries, and the used holder list of
* each thread is initialized to be empty.
*/
for (i = 0; i < thrd_num_total; i++)
{
pgbuf_Pool.thrd_holder_info[i].num_hold_cnt = 0;
pgbuf_Pool.thrd_holder_info[i].num_free_cnt = PGBUF_DEFAULT_FIX_COUNT;
pgbuf_Pool.thrd_holder_info[i].thrd_hold_list = NULL;
pgbuf_Pool.thrd_holder_info[i].thrd_free_list = &(pgbuf_Pool.thrd_reserved_holder[i * PGBUF_DEFAULT_FIX_COUNT]);
for (j = 0; j < PGBUF_DEFAULT_FIX_COUNT; j++)
{
idx = (i * PGBUF_DEFAULT_FIX_COUNT) + j;
pgbuf_Pool.thrd_reserved_holder[idx].fix_count = 0;
pgbuf_Pool.thrd_reserved_holder[idx].bufptr = NULL;
pgbuf_Pool.thrd_reserved_holder[idx].thrd_link = NULL;
INIT_HOLDER_STAT (&(pgbuf_Pool.thrd_reserved_holder[idx].perf_stat));
pgbuf_Pool.thrd_reserved_holder[idx].first_watcher = NULL;
pgbuf_Pool.thrd_reserved_holder[idx].last_watcher = NULL;
pgbuf_Pool.thrd_reserved_holder[idx].watch_count = 0;
if (j == (PGBUF_DEFAULT_FIX_COUNT - 1))
{
pgbuf_Pool.thrd_reserved_holder[idx].next_holder = NULL;
}
else
{
pgbuf_Pool.thrd_reserved_holder[idx].next_holder = &(pgbuf_Pool.thrd_reserved_holder[idx + 1]);
}
}
}
/* phase 3: initialize free BCB holder list shared by all threads */
pthread_mutex_init (&pgbuf_Pool.free_holder_set_mutex, NULL);
pgbuf_Pool.free_holder_set = NULL;
pgbuf_Pool.free_index = -1; /* -1 means that there is no free holder entry */
return NO_ERROR;
}
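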
/*
* pgbuf_allocate_thrd_holder_entry () - Allocates one buffer holder entry
* from the free holder list of given thread
* return: pointer to holder entry or NULL
*
* Note: If the thread's free holder list is empty,
* allocate one from the shared list of free holder arrays.
*/
STATIC_INLINE PGBUF_HOLDER *
pgbuf_allocate_thrd_holder_entry (THREAD_ENTRY * thread_p)
{
int thrd_index;
PGBUF_HOLDER_ANCHOR *thrd_holder_info;
PGBUF_HOLDER *holder;
PGBUF_HOLDER_SET *holder_set;
#if defined(SERVER_MODE)
int rv;
#endif /* SERVER_MODE */
if (thread_p == NULL)
{
thread_p = thread_get_thread_entry_info ();
assert (thread_p != NULL);
}
if (!thread_p->m_holder_anchor)
{
thread_p->m_holder_anchor = &pgbuf_Pool.thrd_holder_info[thread_p->index];
}
thrd_holder_info = thread_p->m_holder_anchor;
if (thrd_holder_info->thrd_free_list != NULL)
{
/* allocate a BCB holder entry from the free BCB holder list of given thread */
holder = thrd_holder_info->thrd_free_list;
thrd_holder_info->thrd_free_list = holder->next_holder;
thrd_holder_info->num_free_cnt -= 1;
}
else
{
/* holder == NULL : free BCB holder list is empty */
/* allocate a BCB holder entry from the free BCB holder list shared by all threads. */
rv = pthread_mutex_lock (&pgbuf_Pool.free_holder_set_mutex);
if (pgbuf_Pool.free_index == -1)
{
/* no usable free holder entry */
/* expand the free BCB holder list shared by threads */
holder_set = (PGBUF_HOLDER_SET *) malloc (PGBUF_HOLDER_SET_SIZEOF);
if (holder_set == NULL)
{
/* This situation must never occur. */
assert (false);
pthread_mutex_unlock (&pgbuf_Pool.free_holder_set_mutex);
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, PGBUF_HOLDER_SET_SIZEOF);
return NULL;
}
holder_set->next_set = pgbuf_Pool.free_holder_set;
pgbuf_Pool.free_holder_set = holder_set;
pgbuf_Pool.free_index = 0;
}
holder = &(pgbuf_Pool.free_holder_set->element[pgbuf_Pool.free_index]);
pgbuf_Pool.free_index += 1;
if (pgbuf_Pool.free_index == PGBUF_NUM_ALLOC_HOLDER)
{
pgbuf_Pool.free_index = -1;
}
pthread_mutex_unlock (&pgbuf_Pool.free_holder_set_mutex);
/* initialize the newly allocated BCB holder entry */
holder->thrd_link = NULL;
}
holder->next_holder = NULL; /* disconnect from free BCB holder list */
/* connect the BCB holder entry at the head of thread's holder list */
holder->thrd_link = thrd_holder_info->thrd_hold_list;
thrd_holder_info->thrd_hold_list = holder;
thrd_holder_info->num_hold_cnt += 1;
holder->first_watcher = NULL;
holder->last_watcher = NULL;
holder->watch_count = 0;
return holder;
}
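/*
 * Editor's sketch (illustrative only, not part of the original source):
 * the allocation above is a two-tier scheme -- a per-thread free list that
 * needs no locking, backed by a mutex-protected shared pool that grows in
 * fixed-size chunks (PGBUF_HOLDER_SET). Freed entries go back to the
 * per-thread list (see pgbuf_remove_thrd_holder below), so shared chunks
 * are never reclaimed individually. The stand-in types are assumptions;
 * the chunk size 10 mirrors PGBUF_NUM_ALLOC_HOLDER.
 */
#if 0 /* editor's illustrative sketch */
struct sketch_node
{
struct sketch_node *next;
};
struct sketch_chunk
{
struct sketch_chunk *next_set;
struct sketch_node element[10];
};
static struct sketch_node *
sketch_two_tier_alloc (struct sketch_node **thread_free_list, struct sketch_chunk **shared_sets,
int *shared_free_index, pthread_mutex_t * shared_mutex)
{
struct sketch_node *node;
if (*thread_free_list != NULL)
{
node = *thread_free_list; /* fast path: no locking */
*thread_free_list = node->next;
return node;
}
pthread_mutex_lock (shared_mutex);
if (*shared_free_index == -1)
{
/* shared pool exhausted; grow it by one chunk */
struct sketch_chunk *chunk = (struct sketch_chunk *) malloc (sizeof (struct sketch_chunk));
if (chunk == NULL)
{
pthread_mutex_unlock (shared_mutex);
return NULL;
}
chunk->next_set = *shared_sets;
*shared_sets = chunk;
*shared_free_index = 0;
}
node = &(*shared_sets)->element[(*shared_free_index)++];
if (*shared_free_index == 10)
{
*shared_free_index = -1; /* chunk consumed */
}
pthread_mutex_unlock (shared_mutex);
return node;
}
#endif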
/*
* pgbuf_find_thrd_holder () - Find the holder entry of current thread on the BCB holder list of given BCB
* return: pointer to holder entry or NULL
* bufptr(in):
*/
STATIC_INLINE PGBUF_HOLDER *
pgbuf_find_thrd_holder (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
{
int thrd_index;
PGBUF_HOLDER *holder;
assert (bufptr != NULL);
if (thread_p == NULL)
{
thread_p = thread_get_thread_entry_info ();
assert (thread_p != NULL);
}
if (!thread_p->m_holder_anchor)
{
thread_p->m_holder_anchor = &pgbuf_Pool.thrd_holder_info[thread_p->index];
}
holder = thread_p->m_holder_anchor->thrd_hold_list;
while (holder != NULL)
{
assert (holder->next_holder == NULL);
if (holder->bufptr == bufptr)
{
break; /* found */
}
holder = holder->thrd_link;
}
return holder;
}
/*
* pgbuf_unlatch_thrd_holder () - decrements by one the fix_count of the current thread's holder entry on the BCB
* holder list of the given BCB
* return: NO_ERROR, or ER_code
* bufptr(in):
*/
STATIC_INLINE int
pgbuf_unlatch_thrd_holder (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, PGBUF_HOLDER_STAT * holder_perf_stat_p)
{
int err = NO_ERROR;
PGBUF_HOLDER *holder;
PAGE_PTR pgptr;
assert (bufptr != NULL);
CAST_BFPTR_TO_PGPTR (pgptr, bufptr);
holder = pgbuf_find_thrd_holder (thread_p, bufptr);
if (holder == NULL)
{
/* This situation must never occur. */
assert (false);
err = ER_PB_UNFIXED_PAGEPTR;
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, err, 3, pgptr, bufptr->vpid.pageid,
fileio_get_volume_label (bufptr->vpid.volid, PEEK));
goto exit_on_error;
}
if (holder_perf_stat_p != NULL)
{
*holder_perf_stat_p = holder->perf_stat;
}
holder->fix_count--;
if (holder->fix_count == 0)
{
/* remove its own BCB holder entry */
if (pgbuf_remove_thrd_holder (thread_p, holder) != NO_ERROR)
{
/* This situation must never occur. */
assert (false);
err = ER_PB_UNFIXED_PAGEPTR;
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, err, 3, pgptr, bufptr->vpid.pageid,
fileio_get_volume_label (bufptr->vpid.volid, PEEK));
goto exit_on_error;
}
}
assert (err == NO_ERROR);
exit_on_error:
return err;
}
/*
* pgbuf_remove_thrd_holder () - Remove holder entry from given BCB
* return: NO_ERROR, or ER_code
* holder(in): pointer to holder entry to be removed
*
* Note: This function removes the given holder entry from the holder list of
* given BCB, and then connect it to the free holder list of the
* corresponding thread.
*/
STATIC_INLINE int
pgbuf_remove_thrd_holder (THREAD_ENTRY * thread_p, PGBUF_HOLDER * holder)
{
int err = NO_ERROR;
int thrd_index;
PGBUF_HOLDER_ANCHOR *thrd_holder_info;
PGBUF_HOLDER *prev;
int found;
assert (holder != NULL);
assert (holder->fix_count == 0);
assert (holder->watch_count == 0);
/* holder->fix_count is always set to some meaningful value when the holder entry is allocated for use. So, at this
* time, we do not need to initialize it. Connect the BCB holder entry to the free BCB holder list of the given thread. */
if (thread_p == NULL)
{
thread_p = thread_get_thread_entry_info ();
assert (thread_p != NULL);
}
if (!thread_p->m_holder_anchor)
{
thread_p->m_holder_anchor = &pgbuf_Pool.thrd_holder_info[thread_p->index];
}
thrd_holder_info = thread_p->m_holder_anchor;
holder->next_holder = thrd_holder_info->thrd_free_list;
thrd_holder_info->thrd_free_list = holder;
thrd_holder_info->num_free_cnt += 1;
/* remove the BCB holder entry from thread's holder list */
if (thrd_holder_info->thrd_hold_list == NULL)
{
/* This situation must never occur. */
assert (false);
err = ER_FAILED;
goto exit_on_error;
}
if (thrd_holder_info->thrd_hold_list == (PGBUF_HOLDER *) holder)
{
thrd_holder_info->thrd_hold_list = holder->thrd_link;
}
else
{
found = false;
prev = thrd_holder_info->thrd_hold_list;
while (prev->thrd_link != NULL)
{
assert (prev->next_holder == NULL);
if (prev->thrd_link == (PGBUF_HOLDER *) holder)
{
prev->thrd_link = holder->thrd_link;
holder->thrd_link = NULL;
found = true;
break;
}
prev = prev->thrd_link;
}
if (found == false)
{
/* This situation must never occur. */
assert (false);
err = ER_FAILED;
goto exit_on_error;
}
}
thrd_holder_info->num_hold_cnt -= 1;
assert (err == NO_ERROR);
exit_on_error:
return err;
}
/*
* pgbuf_latch_bcb_upon_fix () -
* return: NO_ERROR, or ER_code
* bufptr(in):
* request_mode(in):
* buf_lock_acquired(in):
* condition(in):
*
* Note: This function latches the BCB with latch mode LatchMode as long as
* LatchMode is compatible with bcb->LatchMode and there is no
* blocked reader or writer.
* If it cannot latch the BCB right away,
* (1) in case of a conditional request,
* release the mutex and return an error.
* (2) in case of an unconditional request, add the thread to the
* BCB queue, release the mutex and block the thread.
* In any case, if LeafLatchMode is not NO_LATCH and the PageType
* of the page that the BCB points to is P_BPLEAF, latch the BCB
* with latch mode LeafLatchMode.
*/
STATIC_INLINE int
pgbuf_latch_bcb_upon_fix (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, PGBUF_LATCH_MODE request_mode,
int buf_lock_acquired, PGBUF_LATCH_CONDITION condition, bool * is_latch_wait)
{
PGBUF_HOLDER *holder = NULL;
int request_fcnt = 1;
bool is_page_idle;
bool buf_is_dirty;
PGBUF_ATOMIC_LATCH_IMPL old_impl, new_impl;
bool can_latch, promote_needed;
// *INDENT-OFF*
scope_exit unlock_BCB ([bufptr] ()
{
PGBUF_BCB_UNLOCK (bufptr);
});
// *INDENT-ON*
/* parameter validation */
assert (request_mode == PGBUF_LATCH_READ || request_mode == PGBUF_LATCH_WRITE);
assert (condition == PGBUF_UNCONDITIONAL_LATCH || condition == PGBUF_CONDITIONAL_LATCH);
assert (is_latch_wait != NULL);
*is_latch_wait = false;
holder = pgbuf_find_thrd_holder (thread_p, bufptr);
// *INDENT-OFF*
do
{
promote_needed = false;
is_page_idle = false;
can_latch = false;
request_fcnt = 1;
old_impl = get_impl (&bufptr->atomic_latch);
new_impl = old_impl;
if (buf_lock_acquired || old_impl.impl.latch_mode == PGBUF_NO_LATCH)
{
is_page_idle = true;
}
#if defined (SA_MODE)
else
{
holder = pgbuf_find_thrd_holder (thread_p, bufptr);
if (holder == NULL)
{
/* It means bufptr->latch_mode was leaked by the previous holder, since there should be no user except me in
* SA_MODE. */
assert (0);
is_page_idle = true;
}
}
#endif
if (is_page_idle == true)
{
old_impl.impl.waiter_exists = false;
old_impl.impl.latch_mode = PGBUF_NO_LATCH;
old_impl.impl.fcnt = 0;
new_impl = old_impl;
new_impl.impl.latch_mode = request_mode;
new_impl.impl.fcnt = 1;
}
else
{
/* the caller is holding bufptr->mutex */
/* Check if we can grant latch immediately or need to block */
/* Case 1: READ request on READ-latched page without waiters - can grant immediately */
if (request_mode == PGBUF_LATCH_READ && old_impl.impl.latch_mode == PGBUF_LATCH_READ)
{
if (!old_impl.impl.waiter_exists)
{
can_latch = true;
new_impl.impl.fcnt++;
}
else
{
/* some waiters exist; check whether we are the owner */
if (holder == NULL)
{
/* we are not the owner; need to block */
can_latch = false;
/* waiter_exists is already true */
}
else
{
/* we are the owner; grant it */
can_latch = true;
new_impl.impl.fcnt++;
}
}
}
else
{
/* Case 2: Caller is already a holder */
if (holder != NULL)
{
/* Sub-case 2-1: Page is WRITE-latched by holder - can upgrade/regrant */
if (old_impl.impl.latch_mode == PGBUF_LATCH_WRITE)
{
can_latch = true;
new_impl.impl.fcnt++;
}
/* Sub-case 2-2: Page is READ-latched and requesting WRITE latch (promotion) */
else if (old_impl.impl.latch_mode == PGBUF_LATCH_READ)
{
/* If holder is the only one with fix count, can promote to WRITE */
if (old_impl.impl.fcnt == holder->fix_count)
{
can_latch = true;
new_impl.impl.latch_mode = request_mode;
new_impl.impl.fcnt = 1;
}
else
{
/* Other readers exist - need to release holder's READ latch first */
if (condition == PGBUF_CONDITIONAL_LATCH)
{
/* Conditional latch fails */
can_latch = false;
new_impl.impl.waiter_exists = true;
}
else
{
promote_needed = true;
new_impl.impl.fcnt -= holder->fix_count;
can_latch = false;
new_impl.impl.waiter_exists = true;
}
}
}
}
else
{
/* Case 3: Caller is not a holder - need to block and wait */
can_latch = false;
new_impl.impl.waiter_exists = true;
}
}
}
}
while (!bufptr->atomic_latch.
compare_exchange_strong (old_impl.raw, new_impl.raw, std::memory_order::memory_order_acq_rel,
std::memory_order::memory_order_acquire));
// *INDENT-ON*
buf_is_dirty = pgbuf_bcb_is_dirty (bufptr);
if (is_page_idle)
{
PGBUF_BCB_UNLOCK (bufptr);
unlock_BCB.release ();
/* allocate a BCB holder entry */
assert (pgbuf_find_thrd_holder (thread_p, bufptr) == NULL);
holder = pgbuf_allocate_thrd_holder_entry (thread_p);
if (holder == NULL)
{
/* This situation must never occur. */
assert (false);
return ER_FAILED;
}
holder->fix_count = 1;
holder->bufptr = bufptr;
holder->perf_stat.dirtied_by_holder = 0;
if (request_mode == PGBUF_LATCH_WRITE)
{
holder->perf_stat.hold_has_write_latch = 1;
holder->perf_stat.hold_has_read_latch = 0;
}
else
{
holder->perf_stat.hold_has_read_latch = 1;
holder->perf_stat.hold_has_write_latch = 0;
}
holder->perf_stat.dirty_before_hold = buf_is_dirty;
#if defined(SERVER_MODE)
bufptr->latch_last_thread = thread_p;
#endif /* SERVER_MODE */
return NO_ERROR;
}
if (can_latch)
{
PGBUF_BCB_UNLOCK (bufptr);
unlock_BCB.release ();
if (holder != NULL)
{
holder->fix_count++;
/* holder->perf_stat.dirty_before_hold is not changed */
if (request_mode == PGBUF_LATCH_WRITE)
{
holder->perf_stat.hold_has_write_latch = 1;
}
else
{
holder->perf_stat.hold_has_read_latch = 1;
}
}
#if defined(SERVER_MODE)
else
{
/* the caller is not the holder of the buffer page */
/* allocate a BCB holder entry */
holder = pgbuf_allocate_thrd_holder_entry (thread_p);
if (holder == NULL)
{
/* This situation must never occur. */
assert (false);
return ER_FAILED;
}
holder->fix_count = 1;
holder->bufptr = bufptr;
holder->perf_stat.hold_has_read_latch = 1;
holder->perf_stat.hold_has_write_latch = 0;
holder->perf_stat.dirtied_by_holder = 0;
holder->perf_stat.dirty_before_hold = buf_is_dirty;
}
#endif /* SERVER_MODE */
#if defined(SERVER_MODE)
bufptr->latch_last_thread = thread_p;
#endif /* SERVER_MODE */
return NO_ERROR;
}
if (promote_needed)
{
/* Release current READ latch to prepare for WRITE latch */
request_fcnt += holder->fix_count;
holder->fix_count = 0;
INIT_HOLDER_STAT (&holder->perf_stat);
if (pgbuf_remove_thrd_holder (thread_p, holder) != NO_ERROR)
{
/* This situation must never occur. */
assert (false);
return ER_FAILED;
}
}
#if defined (SA_MODE)
assert (0);
#endif
if (condition == PGBUF_CONDITIONAL_LATCH)
{
/* reject the request */
int tran_index;
int wait_msec;
tran_index = LOG_FIND_THREAD_TRAN_INDEX (thread_p);
wait_msec = logtb_find_wait_msecs (tran_index);
if (wait_msec == LK_ZERO_WAIT)
{
const char *client_prog_name; /* Client program name for tran */
const char *client_user_name; /* Client user name for tran */
const char *client_host_name; /* Client host for tran */
int client_pid; /* Client process identifier for tran */
/* setup timeout error, if wait_msec == LK_ZERO_WAIT */
unlock_BCB.release ();
PGBUF_BCB_UNLOCK (bufptr);
(void) logtb_find_client_name_host_pid (tran_index, &client_prog_name, &client_user_name, &client_host_name,
&client_pid);
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_LK_PAGE_TIMEOUT, 8, tran_index, client_user_name,
client_host_name, client_pid, (request_mode == PGBUF_LATCH_READ ? "READ" : "WRITE"),
bufptr->vpid.volid, bufptr->vpid.pageid, NULL);
}
else
{
unlock_BCB.release ();
PGBUF_BCB_UNLOCK (bufptr);
}
return ER_FAILED;
}
else
{
/* block the request */
unlock_BCB.release ();
if (pgbuf_block_bcb (thread_p, bufptr, request_mode, request_fcnt, false) != NO_ERROR)
{
return ER_FAILED;
}
/* The above function released bufptr->mutex unconditionally. */
assert (pgbuf_find_thrd_holder (thread_p, bufptr) == NULL);
holder = pgbuf_allocate_thrd_holder_entry (thread_p);
if (holder == NULL)
{
/* This situation must never occur. */
assert (false);
return ER_FAILED;
}
/* set BCB holder entry */
holder->fix_count = request_fcnt;
holder->bufptr = bufptr;
if (request_mode == PGBUF_LATCH_WRITE)
{
holder->perf_stat.hold_has_write_latch = 1;
}
else if (request_mode == PGBUF_LATCH_READ)
{
holder->perf_stat.hold_has_read_latch = 1;
}
holder->perf_stat.dirtied_by_holder = 0;
holder->perf_stat.dirty_before_hold = buf_is_dirty;
*is_latch_wait = true;
#if defined(SERVER_MODE)
bufptr->latch_last_thread = thread_p;
#endif /* SERVER_MODE */
return NO_ERROR;
}
}
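/*
 * Editor's sketch (illustrative only, not part of the original source):
 * the latch state used above is a single atomic word. A local copy is read,
 * modified, and published with compare-exchange; a concurrent change makes
 * the CAS fail, refresh the local copy, and the loop recompute. Below is a
 * minimal stand-alone version of the pattern for the simplest case (adding
 * one READ reference); the union layout and field widths are assumptions,
 * not the real PGBUF_ATOMIC_LATCH_IMPL layout.
 */
#if 0 /* editor's illustrative sketch */
union sketch_latch
{
uint64_t raw;
struct
{
uint16_t latch_mode; /* e.g. PGBUF_NO_LATCH / READ / WRITE */
uint16_t fcnt; /* fix count */
uint16_t waiter_exists; /* any blocked thread? */
} impl;
};
/* take one more READ reference, or fail if a writer holds the page or waiters queue */
static bool
sketch_try_read_latch (std::atomic<uint64_t> & latch_word, uint16_t latch_read)
{
union sketch_latch old_v, new_v;
old_v.raw = latch_word.load (std::memory_order_acquire);
do
{
if (old_v.impl.latch_mode != latch_read || old_v.impl.waiter_exists)
{
return false; /* caller falls back to the blocking path */
}
new_v = old_v;
new_v.impl.fcnt++;
}
while (!latch_word.compare_exchange_weak (old_v.raw, new_v.raw, std::memory_order_acq_rel,
std::memory_order_acquire));
return true;
}
#endif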
/*
* pgbuf_unlatch_bcb_upon_unfix () - Unlatches BCB
* return: NO_ERROR, or ER_code
* bufptr(in):
*
* Note: It decrements FixCount by one.
* If FixCount becomes 0,
* (1) if LatchMode != FLUSH and LatchMode != VICTIM,
* set LatchMode = NO_LATCH.
* (2) if the BCB waiting queue is empty and Wait is false,
* move the BCB to the top of the LRU list.
* If Flush_Request == TRUE,
* set LatchMode = FLUSH,
* flush the buffer following the WAL protocol and wake up
* threads on the BCB waiting queue.
* If Flush_Request == FALSE,
* if LatchMode == NO_LATCH,
* then wake up the threads on the BCB waiting queue.
* Before returning, it releases the BCB mutex.
*/
STATIC_INLINE int
pgbuf_unlatch_bcb_upon_unfix (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, int holder_status)
{
PAGE_PTR pgptr;
int th_lru_idx;
PGBUF_ZONE zone;
int error_code = NO_ERROR;
PGBUF_ATOMIC_LATCH_IMPL impl_orig, impl_new;
bool blocked_reader_writer = false, is_zero_fcnt = false;
assert (holder_status == NO_ERROR);
/* the caller is holding bufptr->mutex */
assert (!VPID_ISNULL (&bufptr->vpid));
assert (pgbuf_check_bcb_page_vpid (bufptr, false) == true);
CAST_BFPTR_TO_PGPTR (pgptr, bufptr);
/* decrement the fix count */
do
{
blocked_reader_writer = false;
is_zero_fcnt = false;
impl_orig = get_impl (&bufptr->atomic_latch);
impl_new = impl_orig;
impl_new.impl.fcnt--;
blocked_reader_writer = impl_orig.impl.waiter_exists;
if (impl_new.impl.fcnt == 0)
{
is_zero_fcnt = true;
impl_new.impl.latch_mode = PGBUF_NO_LATCH;
}
if (impl_new.impl.fcnt < 0)
{
/* This situation must never occur. */
assert (false);
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_PB_UNFIXED_PAGEPTR, 3, pgptr, bufptr->vpid.pageid,
fileio_get_volume_label (bufptr->vpid.volid, PEEK));
impl_new.impl.latch_mode = PGBUF_NO_LATCH;
impl_new.impl.fcnt = 0;
impl_new.impl.waiter_exists = false;
is_zero_fcnt = true;
break;
}
}
while (!bufptr->atomic_latch.compare_exchange_weak (impl_orig.raw, impl_new.raw, std::memory_order_acq_rel,
std::memory_order_acquire));
if (holder_status != NO_ERROR)
{
/* This situation must never occur. */
assert (false);
PGBUF_BCB_UNLOCK (bufptr);
return ER_FAILED;
}
if (is_zero_fcnt)
{
/* When oldest_unflush_lsa of a page is set, its dirty mark should also be set */
assert (LSA_ISNULL (&bufptr->oldest_unflush_lsa) || pgbuf_bcb_is_dirty (bufptr));
/* there could be some synchronous flushers on the BCB queue */
/* When the page buffer is in the LRU_1 zone, do not move it to the top of the LRU list. This is intentional, for
* performance reasons. */
if (pgbuf_bcb_should_be_moved_to_bottom_lru (bufptr))
{
pgbuf_move_bcb_to_bottom_lru (thread_p, bufptr);
}
else if (blocked_reader_writer == false)
{
pgbuf_Pool.monitor.pg_unfix_cnt.fetch_add (1, std::memory_order_relaxed);
if (PGBUF_THREAD_HAS_PRIVATE_LRU (thread_p))
{
th_lru_idx = PGBUF_LRU_INDEX_FROM_PRIVATE (PGBUF_PRIVATE_LRU_FROM_THREAD (thread_p));
}
else
{
th_lru_idx = -1;
}
zone = pgbuf_bcb_get_zone (bufptr);
switch (zone)
{
case PGBUF_VOID_ZONE:
/* the bcb was recently allocated. this case may range from (almost) never occurring to up to a few
* percent of unfixes (when the hit ratio is very low). in any case, it does not need to be heavily
* optimized here, so the code was moved outside unlatch... do not inline it */
pgbuf_unlatch_void_zone_bcb (thread_p, bufptr, th_lru_idx);
break;
case PGBUF_LRU_1_ZONE:
/* note: this is the most frequently executed path and must be highly optimized! */
if (PGBUF_SHOULD_IGNORE_UNFIX (thread_p, bufptr))
{
/* do nothing */
/* ... except collecting statistics */
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_ONE_KEEP_VAC);
break;
}
if (pgbuf_should_move_private_to_shared (thread_p, bufptr, th_lru_idx))
{
/* move to shared */
pgbuf_lru_move_from_private_to_shared (thread_p, bufptr);
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_ONE_PRV_TO_SHR_MID);
break;
}
/* do not move or boost */
if (PGBUF_IS_PRIVATE_LRU_INDEX (pgbuf_bcb_get_lru_index (bufptr)))
{
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_ONE_PRV_KEEP);
}
else
{
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_ONE_SHR_KEEP);
}
pgbuf_bcb_register_hit_for_lru (bufptr);
break;
case PGBUF_LRU_2_ZONE:
/* this is the buffer zone between hot and victimized. it is less hot than zone one, and we allow boosting
* (if bcbs are old enough). */
if (PGBUF_SHOULD_IGNORE_UNFIX (thread_p, bufptr))
{
/* do nothing */
/* ... except collecting statistics */
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_TWO_KEEP_VAC);
break;
}
if (pgbuf_should_move_private_to_shared (thread_p, bufptr, th_lru_idx))
{
/* move to shared */
pgbuf_lru_move_from_private_to_shared (thread_p, bufptr);
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_TWO_PRV_TO_SHR_MID);
break;
}
if (PGBUF_IS_BCB_OLD_ENOUGH (bufptr, pgbuf_lru_list_from_bcb (bufptr)))
{
/* boost */
pgbuf_lru_boost_bcb (thread_p, bufptr);
}
else
{
/* bcb is too new to tell if it really deserves a boost */
if (PGBUF_IS_PRIVATE_LRU_INDEX (pgbuf_bcb_get_lru_index (bufptr)))
{
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_TWO_PRV_KEEP);
}
else
{
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_TWO_SHR_KEEP);
}
}
pgbuf_bcb_register_hit_for_lru (bufptr);
break;
case PGBUF_LRU_3_ZONE:
if (PGBUF_SHOULD_IGNORE_UNFIX (thread_p, bufptr))
{
if (!pgbuf_bcb_avoid_victim (bufptr) && pgbuf_assign_direct_victim (thread_p, bufptr))
{
/* assigned victim directly */
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
perfmon_inc_stat (thread_p, PSTAT_PB_VICTIM_ASSIGN_DIRECT_VACUUM_LRU);
}
}
else
{
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_THREE_KEEP_VAC);
}
break;
}
if (pgbuf_should_move_private_to_shared (thread_p, bufptr, th_lru_idx))
{
/* move to shared */
pgbuf_lru_move_from_private_to_shared (thread_p, bufptr);
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_LRU_THREE_PRV_TO_SHR_MID);
break;
}
/* boost */
pgbuf_lru_boost_bcb (thread_p, bufptr);
pgbuf_bcb_register_hit_for_lru (bufptr);
break;
default:
/* unexpected */
assert (false);
break;
}
}
#if defined(SERVER_MODE)
pgbuf_wakeup_reader_writer (thread_p, bufptr);
#endif /* SERVER_MODE */
}
assert (get_latch (&bufptr->atomic_latch) != PGBUF_LATCH_FLUSH);
if (pgbuf_bcb_is_async_flush_request (bufptr))
{
/* PGBUF_LATCH_READ is possible when a reader and a flusher were blocked by a writer.
* Blocked readers have already been woken by the previous owner.
*/
assert (get_fcnt (&bufptr->atomic_latch) == 0 || get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_WRITE
|| get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_READ);
/* we need to flush bcb. we won't need the bcb mutex afterwards */
error_code = pgbuf_bcb_safe_flush_force_unlock (thread_p, bufptr, false);
/* what to do with the error? we failed to flush it... */
if (error_code != NO_ERROR)
{
er_clear ();
error_code = NO_ERROR;
}
}
else
{
PGBUF_BCB_UNLOCK (bufptr);
}
return NO_ERROR;
}
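/*
 * Editor's sketch (illustrative only, not part of the original source):
 * unfix is the mirrored CAS loop -- decrement fcnt and, when it reaches
 * zero, clear the latch mode in the same atomic publish, so no concurrent
 * reader can observe "fcnt == 0 but still latched". This reuses the
 * sketch_latch union from the sketch after pgbuf_latch_bcb_upon_fix (an
 * assumed simplified layout); the caller must guarantee fcnt > 0.
 */
#if 0 /* editor's illustrative sketch */
static bool
sketch_unfix (std::atomic<uint64_t> & latch_word, uint16_t no_latch)
{
union sketch_latch old_v, new_v;
old_v.raw = latch_word.load (std::memory_order_acquire);
do
{
new_v = old_v;
new_v.impl.fcnt--;
if (new_v.impl.fcnt == 0)
{
new_v.impl.latch_mode = no_latch;
}
}
while (!latch_word.compare_exchange_weak (old_v.raw, new_v.raw, std::memory_order_acq_rel,
std::memory_order_acquire));
return new_v.impl.fcnt == 0; /* true: caller runs the zero-fix-count path */
}
#endif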
/*
* pgbuf_unlatch_void_zone_bcb () - unlatch bcb that is currently in void zone.
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : void zone bcb to unlatch
* thread_private_lru_index (in) : thread's private lru index. -1 if thread does not have any private list.
*
* note: this is part of the unlatch/unfix algorithm.
*/
static void
pgbuf_unlatch_void_zone_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int thread_private_lru_index)
{
bool aout_enabled = false;
int aout_list_id = PGBUF_AOUT_NOT_FOUND;
assert (pgbuf_bcb_get_zone (bcb) == PGBUF_VOID_ZONE);
if (pgbuf_Pool.buf_AOUT_list.max_count > 0)
{
aout_enabled = true;
aout_list_id = pgbuf_remove_vpid_from_aout_list (thread_p, &bcb->vpid);
}
if (PGBUF_VACUUM_SHOULD_IGNORE_UNFIX (thread_p))
{
/* we are not registering unfix for activity and we are not boosting or moving bcb's */
if (aout_list_id == PGBUF_AOUT_NOT_FOUND)
{
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_VOID_AOUT_NOT_FOUND_VAC);
}
else
{
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_VOID_AOUT_FOUND_VAC);
}
/* can we feed direct victims? */
if (!pgbuf_bcb_avoid_victim (bcb) && pgbuf_assign_direct_victim (thread_p, bcb))
{
/* assigned victim directly */
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
perfmon_inc_stat (thread_p, PSTAT_PB_VICTIM_ASSIGN_DIRECT_VACUUM_VOID);
}
/* add to AOUT */
if (pgbuf_Pool.buf_AOUT_list.max_count > 0)
{
pgbuf_add_vpid_to_aout_list (thread_p, &bcb->vpid, aout_list_id);
}
return;
}
/* reset aout_list_id */
aout_list_id = PGBUF_AOUT_NOT_FOUND;
}
else
{
if (aout_list_id == PGBUF_AOUT_NOT_FOUND)
{
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_VOID_AOUT_NOT_FOUND);
}
else
{
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_VOID_AOUT_FOUND);
}
}
if (thread_private_lru_index != -1)
{
if (PGBUF_VACUUM_SHOULD_IGNORE_UNFIX (thread_p))
{
/* add to top of current private list */
pgbuf_lru_add_new_bcb_to_top (thread_p, bcb, thread_private_lru_index);
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_VOID_TO_PRIVATE_TOP_VAC);
return;
}
if (!aout_enabled || thread_private_lru_index == aout_list_id)
{
/* add to top of current private list */
pgbuf_lru_add_new_bcb_to_top (thread_p, bcb, thread_private_lru_index);
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_VOID_TO_PRIVATE_TOP);
pgbuf_bcb_register_hit_for_lru (bcb);
return;
}
if (aout_list_id == PGBUF_AOUT_NOT_FOUND)
{
/* add to middle of current private list */
pgbuf_lru_add_new_bcb_to_middle (thread_p, bcb, thread_private_lru_index);
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_VOID_TO_PRIVATE_MID);
pgbuf_bcb_register_hit_for_lru (bcb);
return;
}
/* fall through to add to shared */
}
/* add to middle of shared list. */
pgbuf_lru_add_new_bcb_to_middle (thread_p, bcb, pgbuf_get_shared_lru_index_for_add ());
perfmon_inc_stat (thread_p, PSTAT_PB_UNFIX_VOID_TO_SHARED_MID);
if (!PGBUF_VACUUM_SHOULD_IGNORE_UNFIX (thread_p))
{
pgbuf_bcb_register_hit_for_lru (bcb);
}
}
/*
* pgbuf_should_move_private_to_shared () - returns true if the bcb belongs to a private lru list and should be moved
* to a shared lru list.
*
* return : true if move from private to shared is needed.
* thread_p (in) : thread entry
* bcb (in) : bcb
* thread_private_lru_index (in) : thread's private lru index. -1 if thread does not have any private list.
*/
STATIC_INLINE bool
pgbuf_should_move_private_to_shared (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int thread_private_lru_index)
{
int bcb_lru_idx = pgbuf_bcb_get_lru_index (bcb);
if (PGBUF_IS_SHARED_LRU_INDEX (bcb_lru_idx))
{
/* not a private list */
return false;
}
/* two conditions to move from private to shared:
* 1. bcb is fixed by more than one transaction.
* 2. bcb is very hot and old enough. */
/* cond 1 */
if (thread_private_lru_index != bcb_lru_idx)
{
return true;
}
/* cond 2 */
if (!pgbuf_bcb_is_hot (bcb))
{
/* not hot enough */
return false;
}
if (!PGBUF_IS_BCB_OLD_ENOUGH (bcb, PGBUF_GET_LRU_LIST (bcb_lru_idx)))
{
/* not old enough */
return false;
}
/* hot and old enough */
return true;
}
/*
* pgbuf_block_bcb () - Adds the thread to the BCB waiting queue and blocks it
* return: NO_ERROR, or ER_code
* bufptr(in):
* request_mode(in):
* request_fcnt(in):
* as_promote(in): if true, will wait as first promoter
*
* Note: The promoter will be the first waiter. Others are appended to the waiting queue.
*/
static int
pgbuf_block_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, PGBUF_LATCH_MODE request_mode, int request_fcnt,
bool as_promote)
{
#if defined(SERVER_MODE)
THREAD_ENTRY *cur_thrd_entry, *thrd_entry;
assert (get_waiter_exists (&bufptr->atomic_latch) == true);
/* caller is holding bufptr->mutex */
/* request_mode == PGBUF_LATCH_READ/PGBUF_LATCH_WRITE/PGBUF_LATCH_FLUSH */
assert (request_mode == PGBUF_LATCH_READ || request_mode == PGBUF_LATCH_WRITE || request_mode == PGBUF_LATCH_FLUSH);
if (thread_p == NULL)
{
assert (thread_p != NULL);
thread_p = thread_get_thread_entry_info ();
}
cur_thrd_entry = thread_p;
cur_thrd_entry->request_latch_mode = request_mode;
cur_thrd_entry->request_fix_count = request_fcnt; /* SPECIAL_NOTE */
if (as_promote)
{
/* place cur_thrd_entry as first in BCB waiting queue */
/* Safe guard: there can be only one promoter. */
assert (bufptr->next_wait_thrd == NULL || !bufptr->next_wait_thrd->wait_for_latch_promote);
cur_thrd_entry->next_wait_thrd = bufptr->next_wait_thrd;
bufptr->next_wait_thrd = cur_thrd_entry;
}
else
{
/* append cur_thrd_entry to the BCB waiting queue */
cur_thrd_entry->next_wait_thrd = NULL;
thrd_entry = bufptr->next_wait_thrd;
if (thrd_entry == NULL)
{
bufptr->next_wait_thrd = cur_thrd_entry;
}
else
{
while (thrd_entry->next_wait_thrd != NULL)
{
thrd_entry = thrd_entry->next_wait_thrd;
}
thrd_entry->next_wait_thrd = cur_thrd_entry;
}
}
if (request_mode == PGBUF_LATCH_FLUSH)
{
/* is it safe to use infinite wait instead of timed sleep? */
thread_lock_entry (cur_thrd_entry);
PGBUF_BCB_UNLOCK (bufptr);
thread_suspend_wakeup_and_unlock_entry (cur_thrd_entry, THREAD_PGBUF_SUSPENDED);
if (cur_thrd_entry->resume_status != THREAD_PGBUF_RESUMED)
{
/* interrupt operation */
THREAD_ENTRY *thrd_entry, *prev_thrd_entry = NULL;
PGBUF_BCB_LOCK (bufptr);
thrd_entry = bufptr->next_wait_thrd;
while (thrd_entry != NULL)
{
if (thrd_entry == cur_thrd_entry)
{
if (prev_thrd_entry == NULL)
{
bufptr->next_wait_thrd = thrd_entry->next_wait_thrd;
}
else
{
prev_thrd_entry->next_wait_thrd = thrd_entry->next_wait_thrd;
}
thrd_entry->next_wait_thrd = NULL;
PGBUF_BCB_UNLOCK (bufptr);
return ER_FAILED;
}
prev_thrd_entry = thrd_entry;
thrd_entry = thrd_entry->next_wait_thrd;
}
PGBUF_BCB_UNLOCK (bufptr);
}
}
else
{
/*
* We do not guarantee that there is no deadlock between page latches.
* So, we decided that when a read/write buffer fix request is not
* granted immediately, the request is blocked with the timed-sleep
* method. That is, unless the request is woken up by another thread
* within some time interval, it is woken up by timeout. When the
* request is woken up by timeout, it is treated as a deadlock victim.
*/
if (pgbuf_timed_sleep (cur_thrd_entry, bufptr) != NO_ERROR)
{
return ER_FAILED;
}
#if !defined (NDEBUG)
/* Holding the mutex is not required because we hold the latch, which means at least our fix count is kept. */
assert (0 < get_fcnt (&bufptr->atomic_latch));
#endif
}
bufptr->latch_last_thread = thread_p;
#endif /* SERVER_MODE */
return NO_ERROR;
}
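/*
 * Editor's sketch (illustrative only, not part of the original source):
 * waiters form an intrusive FIFO through next_wait_thrd, exactly as in the
 * function above: a promoter is pushed at the head (at most one can exist),
 * every other waiter is appended at the tail.
 */
#if 0 /* editor's illustrative sketch */
static void
sketch_enqueue_waiter (THREAD_ENTRY ** head, THREAD_ENTRY * waiter, bool as_promote)
{
if (as_promote)
{
waiter->next_wait_thrd = *head; /* promoter jumps the queue */
*head = waiter;
return;
}
waiter->next_wait_thrd = NULL;
if (*head == NULL)
{
*head = waiter;
return;
}
THREAD_ENTRY *tail = *head;
while (tail->next_wait_thrd != NULL)
{
tail = tail->next_wait_thrd;
}
tail->next_wait_thrd = waiter; /* FIFO append */
}
#endif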
#if defined(SERVER_MODE)
/*
* pgbuf_timed_sleep_error_handling () -
* return:
* thrd_entry(in):
* bufptr(in):
*/
static int
pgbuf_timed_sleep_error_handling (THREAD_ENTRY * thrd_entry, PGBUF_BCB * bufptr)
{
THREAD_ENTRY *prev_thrd_entry;
THREAD_ENTRY *curr_thrd_entry;
PGBUF_ATOMIC_LATCH_IMPL impl, impl_new;
bool can_grant = false;
PGBUF_BCB_LOCK (bufptr);
/* case 1 : empty waiting queue */
if (bufptr->next_wait_thrd == NULL)
{
/* The thread entry has already been removed from the BCB waiting queue by another thread. */
return NO_ERROR;
}
/* case 2 : first waiting thread != thrd_entry */
if (bufptr->next_wait_thrd != thrd_entry)
{
prev_thrd_entry = bufptr->next_wait_thrd;
while (prev_thrd_entry->next_wait_thrd != NULL)
{
if (prev_thrd_entry->next_wait_thrd == thrd_entry)
{
prev_thrd_entry->next_wait_thrd = thrd_entry->next_wait_thrd;
thrd_entry->next_wait_thrd = NULL;
break;
}
prev_thrd_entry = prev_thrd_entry->next_wait_thrd;
}
return NO_ERROR;
}
/* case 3 : first waiting thread == thrd_entry */
bufptr->next_wait_thrd = thrd_entry->next_wait_thrd;
thrd_entry->next_wait_thrd = NULL;
while (bufptr->next_wait_thrd != NULL)
{
curr_thrd_entry = bufptr->next_wait_thrd;
do
{
can_grant = false;
impl = get_impl (&bufptr->atomic_latch);
impl_new = impl;
if (impl.impl.latch_mode == PGBUF_LATCH_READ && curr_thrd_entry->request_latch_mode == PGBUF_LATCH_READ)
{
can_grant = true;
impl_new.impl.fcnt += curr_thrd_entry->request_fix_count;
}
}
while (!bufptr->atomic_latch.compare_exchange_weak (impl.raw, impl_new.raw, std::memory_order_acq_rel,
std::memory_order_acquire));
if (can_grant)
{
/* grant the request */
thread_lock_entry (curr_thrd_entry);
if (curr_thrd_entry->request_latch_mode == PGBUF_LATCH_READ)
{
/* do not handle the BCB holder entry here; refer to pgbuf_latch_bcb_upon_fix () */
/* remove thrd_entry from BCB waiting queue. */
bufptr->next_wait_thrd = curr_thrd_entry->next_wait_thrd;
curr_thrd_entry->next_wait_thrd = NULL;
/* wake up the thread */
pgbuf_wakeup (curr_thrd_entry);
}
else
{
thread_unlock_entry (curr_thrd_entry);
break;
}
}
else
{
break;
}
}
return NO_ERROR;
}
/*
* pgbuf_timed_sleep () -
* return: NO_ERROR, or ER_code
* thread_p(in):
* bufptr(in):
*/
static int
pgbuf_timed_sleep (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
{
int r;
struct timespec to;
int wait_secs;
int old_wait_msecs;
int save_request_latch_mode;
const char *client_prog_name; /* Client program name for tran */
const char *client_user_name; /* Client user name for tran */
const char *client_host_name; /* Client host for tran */
int client_pid; /* Client process identifier for tran */
bool old_check_interrupt = false;
/* After holding the mutex associated with the condition variable, release bufptr->mutex. */
thread_lock_entry (thread_p);
PGBUF_BCB_UNLOCK (bufptr);
old_wait_msecs = wait_secs = pgbuf_find_current_wait_msecs (thread_p);
assert (wait_secs == LK_INFINITE_WAIT || wait_secs == LK_ZERO_WAIT || wait_secs == LK_FORCE_ZERO_WAIT
|| wait_secs > 0);
if (wait_secs == LK_ZERO_WAIT || wait_secs == LK_FORCE_ZERO_WAIT)
{
wait_secs = 0;
}
else
{
wait_secs = pgbuf_latch_timeout;
}
try_again:
to.tv_sec = (int) time (NULL) + wait_secs;
to.tv_nsec = 0;
if (thread_p->type == TT_WORKER)
{
old_check_interrupt = logtb_set_check_interrupt (thread_p, true);
}
thread_p->resume_status = THREAD_PGBUF_SUSPENDED;
r = thread_suspend_timeout_wakeup_and_unlock_entry (thread_p, &to, THREAD_PGBUF_SUSPENDED);
if (thread_p->type == TT_WORKER)
{
logtb_set_check_interrupt (thread_p, old_check_interrupt);
}
if (r == NO_ERROR)
{
thread_lock_entry (thread_p);
/* someone woke us up */
if (thread_p->resume_status == THREAD_PGBUF_RESUMED)
{
thread_unlock_entry (thread_p);
return NO_ERROR;
}
/* interrupt operation */
thread_p->request_latch_mode = PGBUF_NO_LATCH;
thread_unlock_entry (thread_p);
if (pgbuf_timed_sleep_error_handling (thread_p, bufptr) == NO_ERROR)
{
PGBUF_BCB_UNLOCK (bufptr);
}
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_INTERRUPTED, 0);
return ER_FAILED;
}
else if (r == ER_CSS_PTHREAD_COND_TIMEDOUT)
{
/* rollback operation, postpone operation, etc. */
if (thread_p->resume_status == THREAD_PGBUF_RESUMED)
{
thread_unlock_entry (thread_p);
return NO_ERROR;
}
if (logtb_is_current_active (thread_p) == false)
{
goto try_again;
}
/* buffer page deadlock victim by timeout */
/* the following order of execution is important. */
/* request_latch_mode == PGBUF_NO_LATCH means that the thread has been woken up by timeout. This value must be set
* before releasing the mutex. */
save_request_latch_mode = thread_p->request_latch_mode;
thread_p->request_latch_mode = PGBUF_NO_LATCH;
thread_unlock_entry (thread_p);
if (pgbuf_timed_sleep_error_handling (thread_p, bufptr) == NO_ERROR)
{
goto er_set_return;
}
return ER_FAILED;
}
else
{
thread_unlock_entry (thread_p);
/* error setting */
er_set_with_oserror (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_CSS_PTHREAD_COND_TIMEDWAIT, 0);
return ER_FAILED;
}
er_set_return:
/* error setting */
if (old_wait_msecs == LK_INFINITE_WAIT)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_PAGE_LATCH_TIMEDOUT, 2, bufptr->vpid.volid, bufptr->vpid.pageid);
/* FIXME: remove it. temporarily added for debugging */
assert (0);
PGBUF_BCB_UNLOCK (bufptr);
if (logtb_is_current_active (thread_p) == true)
{
const char *client_prog_name; /* Client program name for transaction */
const char *client_user_name; /* Client user name for transaction */
const char *client_host_name; /* Client host for transaction */
int client_pid; /* Client process identifier for transaction */
int tran_index;
tran_index = LOG_FIND_THREAD_TRAN_INDEX (thread_p);
(void) logtb_find_client_name_host_pid (tran_index, &client_prog_name, &client_user_name, &client_host_name,
&client_pid);
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_LK_UNILATERALLY_ABORTED, 4, tran_index, client_user_name,
client_host_name, client_pid);
}
else
{
/*
* We are already aborting, fall through. Don't do
* double aborts that could cause an infinite loop.
*/
er_log_debug (ARG_FILE_LINE,
"pgbuf_timed_sleep: Likely a system error. Trying to abort a transaction twice.\n");
/* We can release all the page latches held by current thread. */
}
}
else if (old_wait_msecs > 0)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_PAGE_LATCH_TIMEDOUT, 2, bufptr->vpid.volid, bufptr->vpid.pageid);
PGBUF_BCB_UNLOCK (bufptr);
(void) logtb_find_client_name_host_pid (thread_p->tran_index, &client_prog_name, &client_user_name,
&client_host_name, &client_pid);
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_LK_PAGE_TIMEOUT, 8, thread_p->tran_index, client_user_name,
client_host_name, client_pid, (save_request_latch_mode == PGBUF_LATCH_READ ? "READ" : "WRITE"),
bufptr->vpid.volid, bufptr->vpid.pageid, NULL);
}
else
{
PGBUF_BCB_UNLOCK (bufptr);
}
return ER_FAILED;
}
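/*
 * Editor's sketch (illustrative only, not part of the original source):
 * the timed sleep above waits on an absolute wall-clock deadline, the
 * convention of pthread_cond_timedwait-style APIs, so "now + wait_secs"
 * must be recomputed before every retry (see the try_again label above).
 */
#if 0 /* editor's illustrative sketch */
static struct timespec
sketch_absolute_deadline (int wait_secs)
{
struct timespec to;
to.tv_sec = time (NULL) + wait_secs; /* absolute, not relative */
to.tv_nsec = 0;
return to;
}
#endif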
/*
* pgbuf_wakeup_reader_writer () - Wakes up blocked threads on the BCB queue with read or write latch mode
*
* return : void
* thread_p (in) : thread entry
* bufptr (in) : bcb
*/
STATIC_INLINE void
pgbuf_wakeup_reader_writer (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
{
THREAD_ENTRY *thrd_entry = NULL;
THREAD_ENTRY *prev_thrd_entry = NULL;
THREAD_ENTRY *next_thrd_entry = NULL;
PGBUF_ATOMIC_LATCH_IMPL impl = get_impl (&bufptr->atomic_latch), impl_new;
bool can_grant = false, should_stop = false;
/* the caller is holding bufptr->mutex */
#if !defined (NDEBUG)
assert (impl.impl.latch_mode == PGBUF_NO_LATCH && impl.impl.fcnt == 0);
#endif
/* fcnt == 0, bufptr->latch_mode == PGBUF_NO_LATCH */
/* how it works:
*
* we can have multiple types of waiters here:
* 1. PGBUF_NO_LATCH - the thread gave up waiting for the bcb (interrupted or timed out); just remove it from the list.
* 2. PGBUF_LATCH_FLUSH - the thread is waiting for the bcb to be flushed. this is not actually a latch and the thread
* is not woken here. the bcb must either be marked for asynchronous flush or currently be in the process of being
* flushed.
* 3. PGBUF_LATCH_READ - multiple threads can be woken at once (all readers at the head of the list).
* 4. PGBUF_LATCH_WRITE - only the first waiter is woken.
*/
for (thrd_entry = bufptr->next_wait_thrd; thrd_entry != NULL; thrd_entry = next_thrd_entry)
{
next_thrd_entry = thrd_entry->next_wait_thrd;
/* if thrd_entry->request_latch_mode is PGBUF_NO_LATCH, it means the corresponding thread has been woken up
* by timeout. */
if (thrd_entry->request_latch_mode == PGBUF_NO_LATCH)
{
if (prev_thrd_entry == NULL)
{
bufptr->next_wait_thrd = next_thrd_entry;
}
else
{
prev_thrd_entry->next_wait_thrd = next_thrd_entry;
}
thrd_entry->next_wait_thrd = NULL;
continue;
}
if (thrd_entry->request_latch_mode == PGBUF_LATCH_FLUSH)
{
/* must wait for flush. we do not wake it until flush is executed. */
assert (pgbuf_bcb_is_async_flush_request (bufptr) || pgbuf_bcb_is_flushing (bufptr));
/* leave it in the wait list */
prev_thrd_entry = thrd_entry;
continue;
}
do
{
can_grant = false;
impl = get_impl (&bufptr->atomic_latch);
impl_new = impl;
if (impl.impl.latch_mode == PGBUF_NO_LATCH
|| (impl.impl.latch_mode == PGBUF_LATCH_READ && thrd_entry->request_latch_mode == PGBUF_LATCH_READ))
{
thread_lock_entry (thrd_entry);
if (thrd_entry->request_latch_mode == PGBUF_NO_LATCH)
{
can_grant = false;
if (prev_thrd_entry == NULL)
{
bufptr->next_wait_thrd = next_thrd_entry;
}
else
{
prev_thrd_entry->next_wait_thrd = next_thrd_entry;
}
thrd_entry->next_wait_thrd = NULL;
thread_unlock_entry (thrd_entry);
break;
}
can_grant = true;
impl_new.impl.fcnt += thrd_entry->request_fix_count;
impl_new.impl.latch_mode = (PGBUF_LATCH_MODE) (uint16_t) thrd_entry->request_latch_mode;
assert_release (impl_new.impl.latch_mode == PGBUF_NO_LATCH || impl_new.impl.latch_mode == PGBUF_LATCH_READ
|| impl_new.impl.latch_mode == PGBUF_LATCH_WRITE
|| impl_new.impl.latch_mode == PGBUF_LATCH_FLUSH);
}
else if (impl.impl.latch_mode == PGBUF_LATCH_READ)
{
/* Look for other readers. */
prev_thrd_entry = thrd_entry;
break;
}
else
{
assert (impl.impl.latch_mode == PGBUF_LATCH_WRITE);
should_stop = true;
break;
}
}
while (!bufptr->atomic_latch.compare_exchange_strong (impl.raw, impl_new.raw, std::memory_order_acq_rel,
std::memory_order_acquire));
if (should_stop)
{
break;
}
if (can_grant)
{
/* do not handle BCB holder entry, at here. refer pgbuf_latch_bcb_upon_fix () */
/* remove thrd_entry from BCB waiting queue. */
if (prev_thrd_entry == NULL)
{
bufptr->next_wait_thrd = next_thrd_entry;
}
else
{
prev_thrd_entry->next_wait_thrd = next_thrd_entry;
}
thrd_entry->next_wait_thrd = NULL;
/* wake up the thread */
pgbuf_wakeup (thrd_entry);
}
}
if (!pgbuf_is_exist_blocked_reader_writer (bufptr))
{
set_waiter_exists (&bufptr->atomic_latch, false);
}
}
#endif /* SERVER_MODE */
/*
* pgbuf_search_hash_chain () - searches the buffer hash chain to find a BCB with the given page identifier
* return: if success, BCB pointer, otherwise NULL
* hash_anchor(in):
* vpid(in):
*/
STATIC_INLINE PGBUF_BCB *
pgbuf_search_hash_chain (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor, const VPID * vpid)
{
PGBUF_BCB *bufptr;
#if defined(SERVER_MODE)
int rv;
#endif
TSC_TICKS start_tick, end_tick;
UINT64 lock_wait_time = 0;
/* one_phase: no hash-chain mutex */
one_phase:
bufptr = hash_anchor->hash_next;
while (bufptr != NULL)
{
if (VPID_EQ (&(bufptr->vpid), vpid))
{
#if defined(SERVER_MODE)
rv = PGBUF_BCB_TRYLOCK (bufptr);
if (rv == 0)
{
/* OK. go ahead */
}
else
{
if (rv != EBUSY)
{
/* give up one_phase */
goto two_phase;
}
/* An unconditional request is given for acquiring mutex */
PGBUF_BCB_LOCK (bufptr);
}
#else /* SERVER_MODE */
PGBUF_BCB_LOCK (bufptr);
#endif /* SERVER_MODE */
if (!VPID_EQ (&(bufptr->vpid), vpid))
{
/* updated or replaced */
PGBUF_BCB_UNLOCK (bufptr);
/* retry one_phase */
goto one_phase;
}
break;
}
bufptr = bufptr->hash_next;
}
if (bufptr != NULL)
{
return bufptr;
}
#if defined(SERVER_MODE)
/* two_phase: hold hash-chain mutex */
two_phase:
#endif
try_again:
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR))
{
tsc_getticks (&start_tick);
}
rv = pthread_mutex_lock (&hash_anchor->hash_mutex);
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR))
{
tsc_getticks (&end_tick);
lock_wait_time = tsc_elapsed_utime (end_tick, start_tick);
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_HASH_ANCHOR_WAITS);
perfmon_add_stat (thread_p, PSTAT_PB_TIME_HASH_ANCHOR_WAIT, lock_wait_time);
}
bufptr = hash_anchor->hash_next;
while (bufptr != NULL)
{
if (VPID_EQ (&(bufptr->vpid), vpid))
{
#if defined(SERVER_MODE)
rv = PGBUF_BCB_TRYLOCK (bufptr);
if (rv == 0)
{
/* bufptr->mutex is held */
pthread_mutex_unlock (&hash_anchor->hash_mutex);
}
else
{
if (rv != EBUSY)
{
er_set_with_oserror (ER_FATAL_ERROR_SEVERITY, ARG_FILE_LINE, ER_CSS_PTHREAD_MUTEX_TRYLOCK, 0);
return NULL;
}
/* rv == EBUSY : bufptr->mutex is not held */
/* An unconditional request is given for acquiring mutex after releasing hash_mutex. */
pthread_mutex_unlock (&hash_anchor->hash_mutex);
PGBUF_BCB_LOCK (bufptr);
}
#else /* SERVER_MODE */
pthread_mutex_unlock (&hash_anchor->hash_mutex);
PGBUF_BCB_LOCK (bufptr);
#endif /* SERVER_MODE */
if (!VPID_EQ (&(bufptr->vpid), vpid))
{
/* updated or replaced */
PGBUF_BCB_UNLOCK (bufptr);
goto try_again;
}
break;
}
bufptr = bufptr->hash_next;
}
/* At this point: if bufptr != NULL, the caller holds bufptr->mutex but not hash_anchor->hash_mutex;
* if bufptr == NULL, the caller holds hash_anchor->hash_mutex. */
return bufptr;
}
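/*
 * Editor's sketch (illustrative only, not part of the original source):
 * the search above is a two-phase lookup. Phase one walks the chain without
 * the anchor mutex and, after locking the BCB, revalidates the VPID in case
 * the BCB was victimized and reused in the meantime; only when that cannot
 * settle does phase two take the anchor mutex. Skeleton of the optimistic
 * phase, without the trylock shortcut used by the real code:
 */
#if 0 /* editor's illustrative sketch */
static PGBUF_BCB *
sketch_optimistic_lookup (PGBUF_BUFFER_HASH * anchor, const VPID * vpid)
{
retry:
for (PGBUF_BCB * bcb = anchor->hash_next; bcb != NULL; bcb = bcb->hash_next)
{
if (VPID_EQ (&bcb->vpid, vpid))
{
PGBUF_BCB_LOCK (bcb);
if (!VPID_EQ (&bcb->vpid, vpid))
{
/* the BCB was reused for another page while we acquired its mutex */
PGBUF_BCB_UNLOCK (bcb);
goto retry;
}
return bcb; /* returned with the BCB mutex held */
}
}
return NULL; /* caller falls back to the mutex-protected phase */
}
#endif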
STATIC_INLINE PAGE_PTR
pgbuf_lockfree_fix_ro (THREAD_ENTRY * thread_p, const VPID * vpid, PAGE_FETCH_MODE fetch_mode)
{
assert (fetch_mode == OLD_PAGE || fetch_mode == OLD_PAGE_PREVENT_DEALLOC || fetch_mode == OLD_PAGE_MAYBE_DEALLOCATED);
PGBUF_HOLDER *holder = NULL;
PGBUF_ATOMIC_LATCH_IMPL impl, new_impl;
PGBUF_BCB *bufptr =
pgbuf_search_hash_chain_no_bcb_lock (thread_p, &pgbuf_Pool.buf_hash_table[PGBUF_HASH_VALUE (vpid)], vpid);
PAGE_PTR pgptr = NULL;
if (bufptr == NULL)
{
return NULL;
}
do
{
impl = get_impl (&bufptr->atomic_latch);
new_impl = impl;
if (impl.impl.latch_mode != PGBUF_LATCH_READ || impl.impl.waiter_exists || impl.impl.fcnt == 0
|| bufptr->vpid.pageid != vpid->pageid || bufptr->vpid.volid != vpid->volid)
{
return NULL;
}
new_impl.impl.fcnt++;
}
while (!bufptr->atomic_latch.compare_exchange_weak (impl.raw, new_impl.raw,
std::memory_order_acq_rel, std::memory_order_acquire));
holder = pgbuf_find_thrd_holder (thread_p, bufptr);
if (holder != NULL)
{
/* the caller is the holder of the buffer page */
holder->fix_count++;
/* holder->dirty_before_holder not changed */
holder->perf_stat.hold_has_read_latch = 1;
}
#if defined(SERVER_MODE)
else
{
/* the caller is not the holder of the buffer page */
/* allocate a BCB holder entry */
holder = pgbuf_allocate_thrd_holder_entry (thread_p);
if (holder == NULL)
{
/* This situation must never occur. */
assert (false);
return NULL;
}
holder->fix_count = 1;
holder->bufptr = bufptr;
holder->perf_stat.hold_has_read_latch = 1;
holder->perf_stat.hold_has_write_latch = 0;
holder->perf_stat.dirtied_by_holder = 0;
holder->perf_stat.dirty_before_hold = pgbuf_bcb_is_dirty (bufptr);
}
#endif /* SERVER_MODE */
CAST_BFPTR_TO_PGPTR (pgptr, bufptr);
assert (pgptr != NULL);
return pgptr;
}
STATIC_INLINE PGBUF_BCB *
pgbuf_search_hash_chain_no_bcb_lock (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor, const VPID * vpid)
{
PGBUF_BCB *bufptr;
bufptr = hash_anchor->hash_next;
while (bufptr != NULL)
{
if (VPID_EQ (&(bufptr->vpid), vpid))
{
return bufptr;
}
bufptr = bufptr->hash_next;
}
return NULL;
}
STATIC_INLINE bool
pgbuf_lockfree_unfix_ro (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
{
PGBUF_ATOMIC_LATCH_IMPL impl, new_impl;
do
{
impl = get_impl (&bufptr->atomic_latch);
if (impl.impl.latch_mode != PGBUF_LATCH_READ || impl.impl.waiter_exists || impl.impl.fcnt == 1)
{
return false;
}
new_impl = impl;
new_impl.impl.fcnt--;
}
while (!bufptr->atomic_latch.compare_exchange_weak (impl.raw, new_impl.raw,
std::memory_order_acq_rel, std::memory_order_acquire));
#if !defined (NDEBUG)
PAGE_PTR pgptr = NULL;
CAST_BFPTR_TO_PGPTR (pgptr, bufptr);
thread_p->get_pgbuf_tracker ().decrement (pgptr);
#endif // !NDEBUG
return true;
}
/*
* pgbuf_insert_into_hash_chain () - Inserts BCB into the hash chain
* return: NO_ERROR
* hash_anchor(in): hash anchor
* bufptr(in): pointer to buffer page (BCB)
*
* Note: The caller must hold the mutex of the hash anchor before insertion.
* This function does not release the hash anchor's mutex;
* it will be released in the next call of pgbuf_unlock_page ().
*/
STATIC_INLINE int
pgbuf_insert_into_hash_chain (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor, PGBUF_BCB * bufptr)
{
#if defined(SERVER_MODE)
int rv;
#endif /* SERVER_MODE */
TSC_TICKS start_tick, end_tick;
UINT64 lock_wait_time = 0;
if (perfmon_get_activation_flag () & PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR)
{
if (perfmon_is_perf_tracking ())
{
tsc_getticks (&start_tick);
}
}
/* Note that the caller is not holding bufptr->mutex */
rv = pthread_mutex_lock (&hash_anchor->hash_mutex);
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR))
{
tsc_getticks (&end_tick);
lock_wait_time = tsc_elapsed_utime (end_tick, start_tick);
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_HASH_ANCHOR_WAITS);
perfmon_add_stat (thread_p, PSTAT_PB_TIME_HASH_ANCHOR_WAIT, lock_wait_time);
}
bufptr->hash_next = hash_anchor->hash_next;
hash_anchor->hash_next = bufptr;
/*
* hash_anchor->hash_mutex is not released at this point.
* The current BCB was newly allocated by the caller and
* has now been connected into the corresponding buffer hash chain.
* hash_anchor->hash_mutex will be released in pgbuf_unlock_page ()
* after the acquired buffer lock on the BCB is released.
*/
return NO_ERROR;
}
/*
* pgbuf_delete_from_hash_chain () - Deletes BCB from the hash chain
* return: NO_ERROR, or ER_code
* bufptr(in): pointer to buffer page
*/
STATIC_INLINE int
pgbuf_delete_from_hash_chain (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
{
PGBUF_BUFFER_HASH *hash_anchor;
PGBUF_BCB *prev_bufptr;
PGBUF_BCB *curr_bufptr;
#if defined(SERVER_MODE)
int rv;
#endif /* SERVER_MODE */
TSC_TICKS start_tick, end_tick;
UINT64 lock_wait_time = 0;
if (perfmon_get_activation_flag () & PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR)
{
if (perfmon_is_perf_tracking ())
{
tsc_getticks (&start_tick);
}
}
/* the caller is holding bufptr->mutex */
/* fcnt==0, next_wait_thrd==NULL, latch_mode==PGBUF_NO_LATCH */
/* if (bufptr->latch_mode==PGBUF_NO_LATCH) invoked by an invalidator */
hash_anchor = &(pgbuf_Pool.buf_hash_table[PGBUF_HASH_VALUE (&(bufptr->vpid))]);
rv = pthread_mutex_lock (&hash_anchor->hash_mutex);
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR))
{
tsc_getticks (&end_tick);
lock_wait_time = tsc_elapsed_utime (end_tick, start_tick);
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_HASH_ANCHOR_WAITS);
perfmon_add_stat (thread_p, PSTAT_PB_TIME_HASH_ANCHOR_WAIT, lock_wait_time);
}
if (pgbuf_bcb_is_flushing (bufptr))
{
assert (false);
/* Someone is trying to fix the current buffer page, so give up selecting it as a victim. */
pthread_mutex_unlock (&hash_anchor->hash_mutex);
set_latch (&bufptr->atomic_latch, PGBUF_NO_LATCH);
PGBUF_BCB_UNLOCK (bufptr);
return ER_FAILED;
}
else
{
/* find current BCB in buffer hash chain */
prev_bufptr = NULL;
curr_bufptr = hash_anchor->hash_next;
while (curr_bufptr != NULL)
{
if (curr_bufptr == bufptr)
{
break;
}
prev_bufptr = curr_bufptr;
curr_bufptr = curr_bufptr->hash_next;
}
if (curr_bufptr == NULL)
{
assert (false);
pthread_mutex_unlock (&hash_anchor->hash_mutex);
/* Now, the caller is holding bufptr->mutex. */
/* bufptr->mutex will be released in the following function. */
pgbuf_put_bcb_into_invalid_list (thread_p, bufptr);
return ER_FAILED;
}
/* disconnect the BCB from the buffer hash chain */
if (prev_bufptr == NULL)
{
hash_anchor->hash_next = curr_bufptr->hash_next;
}
else
{
prev_bufptr->hash_next = curr_bufptr->hash_next;
}
curr_bufptr->hash_next = NULL;
pthread_mutex_unlock (&hash_anchor->hash_mutex);
VPID_SET_NULL (&(bufptr->vpid));
pgbuf_bcb_check_and_reset_fix_and_avoid_dealloc (bufptr, ARG_FILE_LINE);
return NO_ERROR;
}
}
/*
* pgbuf_lock_page () - Puts a buffer lock on the buffer lock chain
* return: If success, PGBUF_LOCK_HOLDER, otherwise PGBUF_LOCK_WAITER
* hash_anchor(in):
* vpid(in):
*
* Note: This function is invoked only when the page is not in the buffer hash
* chain. The caller is holding hash_anchor->hash_mutex.
* Before return, the thread releases hash_anchor->hash_mutex.
*/
static int
pgbuf_lock_page (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor, const VPID * vpid)
{
#if defined(SERVER_MODE)
PGBUF_BUFFER_LOCK *cur_buffer_lock;
THREAD_ENTRY *cur_thrd_entry;
TSC_TICKS start_tick, end_tick;
UINT64 lock_wait_time = 0;
/* the caller is holding hash_anchor->hash_mutex */
/* check whether the page is in the Buffer Lock Chain */
if (thread_p == NULL)
{
assert (thread_p != NULL);
thread_p = thread_get_thread_entry_info ();
}
cur_thrd_entry = thread_p;
cur_buffer_lock = hash_anchor->lock_next;
/* find vpid in buffer lock chain */
while (cur_buffer_lock != NULL)
{
if (VPID_EQ (&(cur_buffer_lock->vpid), vpid))
{
/* found */
cur_thrd_entry->next_wait_thrd = cur_buffer_lock->next_wait_thrd;
cur_buffer_lock->next_wait_thrd = cur_thrd_entry;
pgbuf_sleep (cur_thrd_entry, &hash_anchor->hash_mutex);
if (cur_thrd_entry->resume_status != THREAD_PGBUF_RESUMED)
{
/* interrupt operation */
THREAD_ENTRY *thrd_entry, *prev_thrd_entry = NULL;
int r;
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR))
{
tsc_getticks (&start_tick);
}
r = pthread_mutex_lock (&hash_anchor->hash_mutex);
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR))
{
tsc_getticks (&end_tick);
lock_wait_time = tsc_elapsed_utime (end_tick, start_tick);
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_HASH_ANCHOR_WAITS);
perfmon_add_stat (thread_p, PSTAT_PB_TIME_HASH_ANCHOR_WAIT, lock_wait_time);
}
thrd_entry = cur_buffer_lock->next_wait_thrd;
while (thrd_entry != NULL)
{
if (thrd_entry == cur_thrd_entry)
{
if (prev_thrd_entry == NULL)
{
cur_buffer_lock->next_wait_thrd = thrd_entry->next_wait_thrd;
}
else
{
prev_thrd_entry->next_wait_thrd = thrd_entry->next_wait_thrd;
}
thrd_entry->next_wait_thrd = NULL;
pthread_mutex_unlock (&hash_anchor->hash_mutex);
perfmon_inc_stat (thread_p, PSTAT_LK_NUM_WAITED_ON_PAGES); /* monitoring */
return PGBUF_LOCK_WAITER;
}
prev_thrd_entry = thrd_entry;
thrd_entry = thrd_entry->next_wait_thrd;
}
pthread_mutex_unlock (&hash_anchor->hash_mutex);
}
perfmon_inc_stat (thread_p, PSTAT_LK_NUM_WAITED_ON_PAGES); /* monitoring */
return PGBUF_LOCK_WAITER;
}
cur_buffer_lock = cur_buffer_lock->lock_next;
}
/* buf_lock_table is implemented to have one entry for each thread. At first design, it had one entry for each
* transaction. cur_thrd_entry->index : thread entry index, cur_thrd_entry->tran_index : transaction entry index */
/* vpid is not found in the Buffer Lock Chain */
cur_buffer_lock = &(pgbuf_Pool.buf_lock_table[cur_thrd_entry->index]);
cur_buffer_lock->vpid = *vpid;
cur_buffer_lock->next_wait_thrd = NULL;
cur_buffer_lock->lock_next = hash_anchor->lock_next;
hash_anchor->lock_next = cur_buffer_lock;
pthread_mutex_unlock (&hash_anchor->hash_mutex);
#endif /* SERVER_MODE */
perfmon_inc_stat (thread_p, PSTAT_LK_NUM_ACQUIRED_ON_PAGES); /* monitoring */
return PGBUF_LOCK_HOLDER;
}
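/* Illustrative sketch (not part of the build): how a page fixer is expected to drive the buffer-lock protocol above.
 * The control flow is simplified and the try_again label is hypothetical; the real flow lives in pgbuf_fix and
 * pgbuf_claim_bcb_for_fix.
 *
 *   // caller holds hash_anchor->hash_mutex and the page is not in the hash chain
 *   if (pgbuf_lock_page (thread_p, hash_anchor, vpid) != PGBUF_LOCK_HOLDER)
 *     {
 *       // PGBUF_LOCK_WAITER: another thread is reading this page from disk.
 *       // we slept and were woken by its pgbuf_unlock_page; retry the hash lookup.
 *       goto try_again;
 *     }
 *   // PGBUF_LOCK_HOLDER: this thread owns the I/O for vpid. read the page,
 *   // connect the BCB to the hash chain, then wake all waiters:
 *   (void) pgbuf_unlock_page (thread_p, hash_anchor, vpid, true);
 */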
/*
* pgbuf_unlock_page () - Deletes a buffer lock from the buffer lock chain
* return: NO_ERROR
* hash_anchor(in):
* vpid(in):
* need_hash_mutex(in):
*
* Note: This function is invoked only after the page is read into buffer and
* the BCB is connected into its corresponding buffer hash chain.
* Before return, the thread releases the hash mutex on the hash
* anchor and wakes up all the threads blocked on the queue of the
* buffer lock record.
*/
static int
pgbuf_unlock_page (THREAD_ENTRY * thread_p, PGBUF_BUFFER_HASH * hash_anchor, const VPID * vpid, int need_hash_mutex)
{
#if defined(SERVER_MODE)
int rv;
TSC_TICKS start_tick, end_tick;
UINT64 lock_wait_time = 0;
PGBUF_BUFFER_LOCK *prev_buffer_lock, *cur_buffer_lock;
THREAD_ENTRY *cur_thrd_entry;
if (need_hash_mutex)
{
if (perfmon_get_activation_flag () & PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR)
{
if (perfmon_is_perf_tracking ())
{
tsc_getticks (&start_tick);
}
}
rv = pthread_mutex_lock (&hash_anchor->hash_mutex);
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_HASH_ANCHOR))
{
tsc_getticks (&end_tick);
lock_wait_time = tsc_elapsed_utime (end_tick, start_tick);
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_HASH_ANCHOR_WAITS);
perfmon_add_stat (thread_p, PSTAT_PB_TIME_HASH_ANCHOR_WAIT, lock_wait_time);
}
}
/* check whether the page is in the Buffer Lock Chain */
prev_buffer_lock = NULL;
cur_buffer_lock = hash_anchor->lock_next;
while (cur_buffer_lock != NULL)
{
if (VPID_EQ (&(cur_buffer_lock->vpid), vpid))
{
break;
}
prev_buffer_lock = cur_buffer_lock;
cur_buffer_lock = cur_buffer_lock->lock_next;
}
if (cur_buffer_lock != NULL)
{
if (prev_buffer_lock == NULL)
{
hash_anchor->lock_next = cur_buffer_lock->lock_next;
}
else
{
prev_buffer_lock->lock_next = cur_buffer_lock->lock_next;
}
cur_buffer_lock->lock_next = NULL;
pthread_mutex_unlock (&hash_anchor->hash_mutex);
while ((cur_thrd_entry = cur_buffer_lock->next_wait_thrd) != NULL)
{
cur_buffer_lock->next_wait_thrd = cur_thrd_entry->next_wait_thrd;
cur_thrd_entry->next_wait_thrd = NULL;
pgbuf_wakeup_uncond (cur_thrd_entry);
}
}
else
{
pthread_mutex_unlock (&hash_anchor->hash_mutex);
}
#endif /* SERVER_MODE */
return NO_ERROR;
}
/*
* pgbuf_allocate_bcb () - Allocates a BCB
* return: If success, a newly allocated BCB, otherwise NULL
* src_vpid(in):
*
* Note: This function allocates a BCB from the buffer invalid list or the LRU list.
* It is invoked only when a page is not in buffer.
*/
static PGBUF_BCB *
pgbuf_allocate_bcb (THREAD_ENTRY * thread_p, const VPID * src_vpid)
{
PGBUF_BCB *bufptr;
PERF_UTIME_TRACKER time_tracker_alloc_bcb = PERF_UTIME_TRACKER_INITIALIZER;
PERF_UTIME_TRACKER time_tracker_alloc_search_and_wait = PERF_UTIME_TRACKER_INITIALIZER;
bool detailed_perf = perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION);
int tran_index = LOG_FIND_THREAD_TRAN_INDEX (thread_p);
PGBUF_STATUS *show_status = &pgbuf_Pool.show_status[tran_index];
#if defined (SERVER_MODE)
struct timespec to;
int r = 0;
PERF_STAT_ID pstat_cond_wait;
bool high_priority = false;
#endif /* SERVER_MODE */
/* how it works: we need to free a bcb for new VPID.
* 1. first source should be invalid list. initially, all bcb's will be in this list. sometimes, bcb's can be added to
* this list during runtime. in any case, these bcb's are not used by anyone, do not need any flush or other
* actions and are the best option for allocating a bcb.
* 2. search the bcb in lru lists by calling pgbuf_get_victim.
* 3. if search failed then:
* SERVER_MODE: thread is added to one of two queues: high priority waiting threads queue or low priority waiting
* threads queue. high priority is usually populated by vacuum threads or by threads holding latch
* on very hot pages (b-tree roots, heap headers, volume header or file headers).
* thread will then be assigned a victim directly (there are multiple ways this can happen) and woken
* up.
* TODO: we have one big vulnerability with waiting threads. what if, for any reason, no one feeds the
* waiting thread with a victim. page flush thread may be sleeping and no one wakes it, and the
* activity may be so reduced that no adjustments are made to lists. thread ends up with
* timeout. right now, after we added the victim rich hack, this may not happen. we could
* consider a backup plan to generate victims for a forgotten waiter.
* SA_MODE: pages are flushed and victim is searched again (and we expect this time to find a victim).
*
* note: SA_MODE approach also applies to server-mode recovery (or in any circumstance which has page flush thread
* unavailable).
*/
/* allocate a BCB from invalid BCB list */
bufptr = pgbuf_get_bcb_from_invalid_list (thread_p);
if (bufptr != NULL)
{
return bufptr;
}
PERF_UTIME_TRACKER_START (thread_p, &time_tracker_alloc_bcb);
if (detailed_perf)
{
PERF_UTIME_TRACKER_START (thread_p, &time_tracker_alloc_search_and_wait);
}
/* search lru lists */
bufptr = pgbuf_get_victim (thread_p);
PERF_UTIME_TRACKER_TIME_AND_RESTART (thread_p, &time_tracker_alloc_search_and_wait, PSTAT_PB_ALLOC_BCB_SEARCH_VICTIM);
if (bufptr != NULL)
{
goto end;
}
#if defined (SERVER_MODE)
if (pgbuf_is_page_flush_daemon_available ())
{
retry:
high_priority = high_priority || VACUUM_IS_THREAD_VACUUM (thread_p) || pgbuf_is_thread_high_priority (thread_p);
/* add to waiters thread list to be assigned victim directly */
to.tv_sec = (int) time (NULL) + pgbuf_latch_timeout;
to.tv_nsec = 0;
thread_lock_entry (thread_p);
assert (pgbuf_Pool.direct_victims.bcb_victims[thread_p->index] == NULL);
/* push to waiter thread list */
if (high_priority)
{
if (detailed_perf && VACUUM_IS_THREAD_VACUUM (thread_p))
{
perfmon_inc_stat (thread_p, PSTAT_PB_ALLOC_BCB_PRIORITIZE_VACUUM);
}
if (!pgbuf_Pool.direct_victims.waiter_threads_high_priority->produce (thread_p))
{
assert (false);
thread_unlock_entry (thread_p);
return NULL;
}
pstat_cond_wait = PSTAT_PB_ALLOC_BCB_COND_WAIT_HIGH_PRIO;
}
else
{
if (!pgbuf_Pool.direct_victims.waiter_threads_low_priority->produce (thread_p))
{
/* ok, we have this very weird case when a consumer can be preempted for a very long time (which prevents
* producers from being able to push to the queue). I don't know how this is even possible, I just know I
* found a case. I cannot tell exactly how long the consumer was preempted, but I know the time difference
* between the producer still waiting to be woken by that consumer and the producer failing to add was 93
* milliseconds. Which is huge if you ask me.
* I doubled the size of the queue, but theoretically, this is still possible. I also removed the
* ABORT_RELEASE, but we may have to think of a way to handle this preempted consumer case. */
/* we do a hack for this case: we add the thread to the high-priority queue instead, which is usually less
* used, so the same situation is (almost) impossible to happen there. */
if (!pgbuf_Pool.direct_victims.waiter_threads_high_priority->produce (thread_p))
{
assert (false);
thread_unlock_entry (thread_p);
goto end;
}
pstat_cond_wait = PSTAT_PB_ALLOC_BCB_COND_WAIT_HIGH_PRIO;
}
else
{
pstat_cond_wait = PSTAT_PB_ALLOC_BCB_COND_WAIT_LOW_PRIO;
}
}
/* make sure at least flush will feed us with bcb's. */
// before migration of the page_flush_daemon it was a try_wakeup, check if still needed
pgbuf_wakeup_page_flush_daemon (thread_p);
show_status->num_flusher_waiting_threads++;
r = thread_suspend_timeout_wakeup_and_unlock_entry (thread_p, &to, THREAD_ALLOC_BCB_SUSPENDED);
show_status->num_flusher_waiting_threads--;
PERF_UTIME_TRACKER_TIME (thread_p, &time_tracker_alloc_search_and_wait, pstat_cond_wait);
if (r == NO_ERROR)
{
if (thread_p->resume_status == THREAD_ALLOC_BCB_RESUMED)
{
bufptr = pgbuf_get_direct_victim (thread_p);
if (bufptr == NULL)
{
/* bcb was fixed again */
high_priority = true;
goto retry;
}
goto end;
}
/* no bcb should be allocated. */
/* interrupted */
assert (thread_p->resume_status == THREAD_RESUME_DUE_TO_INTERRUPT
|| thread_p->resume_status == THREAD_RESUME_DUE_TO_SHUTDOWN);
if (pgbuf_Pool.direct_victims.bcb_victims[thread_p->index] != NULL)
{
/* a bcb was assigned before being interrupted. it must be "unassigned" */
pgbuf_bcb_update_flags (thread_p, pgbuf_Pool.direct_victims.bcb_victims[thread_p->index], 0,
PGBUF_BCB_VICTIM_DIRECT_FLAG | PGBUF_BCB_INVALIDATE_DIRECT_VICTIM_FLAG);
pgbuf_Pool.direct_victims.bcb_victims[thread_p->index] = NULL;
}
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_INTERRUPTED, 0);
}
else
{
/* should not timeout! */
assert (r != ER_CSS_PTHREAD_COND_TIMEDOUT);
thread_p->resume_status = THREAD_ALLOC_BCB_RESUMED;
thread_unlock_entry (thread_p);
if (r == ER_CSS_PTHREAD_COND_TIMEDOUT)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_CSS_PTHREAD_COND_TIMEDOUT, 0);
}
}
}
else
#endif /* SERVER_MODE */
{
/* flush */
pgbuf_wakeup_page_flush_daemon (thread_p);
/* search lru lists again */
bufptr = pgbuf_get_victim (thread_p);
PERF_UTIME_TRACKER_TIME (thread_p, &time_tracker_alloc_search_and_wait, PSTAT_PB_ALLOC_BCB_SEARCH_VICTIM);
assert (bufptr != NULL);
}
end:
if (bufptr != NULL)
{
/* victimize the buffer */
if (pgbuf_victimize_bcb (thread_p, bufptr) != NO_ERROR)
{
assert (false);
bufptr = NULL;
}
}
else
{
if (er_errid () == NO_ERROR)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_PB_ALL_BUFFERS_DIRTY, 1, 0);
}
}
PERF_UTIME_TRACKER_TIME (thread_p, &time_tracker_alloc_bcb, PSTAT_PB_ALLOC_BCB);
return bufptr;
}
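/* Illustrative sketch (not part of the build) of the direct-victim handshake used above. The waiter side mirrors the
 * code in pgbuf_allocate_bcb; the feeder side is conceptual only, since the actual feeders (the flush daemon and the
 * lru adjusting code) go through pgbuf_assign_direct_victim.
 *
 *   // waiter (this function):
 *   pgbuf_Pool.direct_victims.waiter_threads_low_priority->produce (thread_p);
 *   thread_suspend_timeout_wakeup_and_unlock_entry (thread_p, &to, THREAD_ALLOC_BCB_SUSPENDED);
 *   bufptr = pgbuf_get_direct_victim (thread_p);   // reads bcb_victims[thread_p->index]
 *
 *   // feeder (conceptually):
 *   if (waiter_queue->consume (waiter))
 *     {
 *       pgbuf_Pool.direct_victims.bcb_victims[waiter->index] = victim_bcb;
 *       // wake the waiter with resume status THREAD_ALLOC_BCB_RESUMED
 *     }
 */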
/*
* pgbuf_claim_bcb_for_fix () - function used for page fix to claim a bcb when page is not found in buffer
*
* return : claimed BCB
* thread_p (in) : thread entry
* vpid (in) : page identifier
* fetch_mode (in) : fetch mode
* hash_anchor (in/out) : hash anchor
* perf (in/out) : page fix performance monitoring helper
* try_again (out) : output true to try getting the bcb again
* already_locked (in) : true if the caller already holds the page lock, so pgbuf_lock_page is skipped
*/
static PGBUF_BCB *
pgbuf_claim_bcb_for_fix (THREAD_ENTRY * thread_p, const VPID * vpid, PAGE_FETCH_MODE fetch_mode,
PGBUF_BUFFER_HASH * hash_anchor, PGBUF_FIX_PERF * perf, bool * try_again, bool already_locked)
{
PGBUF_BCB *bufptr = NULL;
PAGE_PTR pgptr = NULL;
TDE_ALGORITHM tde_algo = TDE_ALGORITHM_NONE;
bool success;
int tran_index = LOG_FIND_THREAD_TRAN_INDEX (thread_p);
PGBUF_STATUS *show_status = &pgbuf_Pool.show_status[tran_index];
PGBUF_ATOMIC_LATCH_IMPL impl;
#if defined (ENABLE_SYSTEMTAP)
bool monitored = false;
QUERY_ID query_id = NULL_QUERY_ID;
#endif /* ENABLE_SYSTEMTAP */
assert (fetch_mode != OLD_PAGE_IF_IN_BUFFER);
/* The page was not found in the hash chain. The caller is holding hash_anchor->hash_mutex. */
if (er_errid () == ER_CSS_PTHREAD_MUTEX_TRYLOCK)
{
pthread_mutex_unlock (&hash_anchor->hash_mutex);
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
return NULL;
}
/* In this case, the caller is holding only hash_anchor->hash_mutex. The hash_anchor->hash_mutex is to be
* released in pgbuf_lock_page (). */
if (!already_locked && pgbuf_lock_page (thread_p, hash_anchor, vpid) != PGBUF_LOCK_HOLDER)
{
if (perf)
{
if (perf->is_perf_tracking)
{
tsc_getticks (&perf->end_tick);
tsc_elapsed_time_usec (&perf->tv_diff, perf->end_tick, perf->start_tick);
perf->lock_wait_time = perf->tv_diff.tv_sec * 1000000LL + perf->tv_diff.tv_usec;
}
if (fetch_mode == NEW_PAGE)
{
perf->perf_page_found = PERF_PAGE_MODE_NEW_LOCK_WAIT;
}
else
{
perf->perf_page_found = PERF_PAGE_MODE_OLD_LOCK_WAIT;
}
}
*try_again = true;
return NULL;
}
if (perf)
{
if (perf->perf_page_found != PERF_PAGE_MODE_NEW_LOCK_WAIT
&& perf->perf_page_found != PERF_PAGE_MODE_OLD_LOCK_WAIT)
{
if (fetch_mode == NEW_PAGE)
{
perf->perf_page_found = PERF_PAGE_MODE_NEW_NO_WAIT;
}
else
{
perf->perf_page_found = PERF_PAGE_MODE_OLD_NO_WAIT;
}
}
}
/* Now, the caller is not holding any mutex. */
bufptr = pgbuf_allocate_bcb (thread_p, vpid);
if (bufptr == NULL)
{
ASSERT_ERROR ();
(void) pgbuf_unlock_page (thread_p, hash_anchor, vpid, true);
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
return NULL;
}
/* Currently, caller has one allocated BCB and is holding mutex */
/* initialize the BCB */
bufptr->vpid = *vpid;
assert (!pgbuf_bcb_avoid_victim (bufptr));
impl = get_impl (&bufptr->atomic_latch);
impl.impl.latch_mode = PGBUF_NO_LATCH;
impl.impl.waiter_exists = false;
impl.impl.fcnt = 0;
bufptr->atomic_latch.store (impl.raw);
pgbuf_bcb_update_flags (thread_p, bufptr, 0, PGBUF_BCB_ASYNC_FLUSH_REQ); /* todo: why this?? */
pgbuf_bcb_check_and_reset_fix_and_avoid_dealloc (bufptr, ARG_FILE_LINE);
LSA_SET_NULL (&bufptr->oldest_unflush_lsa);
if (fetch_mode != NEW_PAGE)
{
/* Record number of reads in statistics */
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_IOREADS);
show_status->num_pages_read++;
#if defined(ENABLE_SYSTEMTAP)
query_id = qmgr_get_current_query_id (thread_p);
if (query_id != NULL_QUERY_ID)
{
monitored = true;
CUBRID_IO_READ_START (query_id);
}
#endif /* ENABLE_SYSTEMTAP */
if (dwb_read_page (thread_p, vpid, &bufptr->iopage_buffer->iopage, &success) != NO_ERROR)
{
/* Should not happen */
assert (false);
return NULL;
}
else if (success == true)
{
/* Nothing to do, copied from DWB */
}
else if (fileio_read (thread_p, fileio_get_volume_descriptor (vpid->volid), &bufptr->iopage_buffer->iopage,
vpid->pageid, IO_PAGESIZE) == NULL)
{
/* There was an error in reading the page. Clean the buffer... since it may have been corrupted */
ASSERT_ERROR ();
/* bufptr->mutex will be released in following function. */
pgbuf_put_bcb_into_invalid_list (thread_p, bufptr);
/*
* Now, caller is not holding any mutex.
* the last argument of pgbuf_unlock_page () is true, which
* means the hash_mutex must be acquired before unlocking the page.
*/
(void) pgbuf_unlock_page (thread_p, hash_anchor, vpid, true);
#if defined(ENABLE_SYSTEMTAP)
if (monitored == true)
{
CUBRID_IO_READ_END (query_id, IO_PAGESIZE, 1);
}
#endif /* ENABLE_SYSTEMTAP */
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
return NULL;
}
CAST_IOPGPTR_TO_PGPTR (pgptr, &bufptr->iopage_buffer->iopage);
tde_algo = pgbuf_get_tde_algorithm (pgptr);
if (tde_algo != TDE_ALGORITHM_NONE)
{
if (tde_decrypt_data_page
(&bufptr->iopage_buffer->iopage, tde_algo, pgbuf_is_temporary_volume (vpid->volid),
&bufptr->iopage_buffer->iopage) != NO_ERROR)
{
ASSERT_ERROR ();
pgbuf_put_bcb_into_invalid_list (thread_p, bufptr);
(void) pgbuf_unlock_page (thread_p, hash_anchor, vpid, true);
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
return NULL;
}
}
#if defined(ENABLE_SYSTEMTAP)
if (monitored == true)
{
CUBRID_IO_READ_END (query_id, IO_PAGESIZE, 0);
}
#endif /* ENABLE_SYSTEMTAP */
if (pgbuf_is_temporary_volume (vpid->volid) == true)
{
/* Check if the first time to access */
if (!pgbuf_is_temp_lsa (bufptr->iopage_buffer->iopage.prv.lsa))
{
pgbuf_init_temp_page_lsa (&bufptr->iopage_buffer->iopage, IO_PAGESIZE);
pgbuf_set_dirty_buffer_ptr (thread_p, bufptr);
}
}
#if !defined (NDEBUG)
/* perm volume */
if (bufptr->vpid.volid > NULL_VOLID)
{
if (!log_is_in_crash_recovery ())
{
if (!LSA_ISNULL (&bufptr->iopage_buffer->iopage.prv.lsa))
{
assert (bufptr->iopage_buffer->iopage.prv.pageid != -1);
assert (bufptr->iopage_buffer->iopage.prv.volid != -1);
}
}
}
#endif /* NDEBUG */
if (thread_get_sort_stats_active (thread_p))
{
perfmon_inc_stat (thread_p, PSTAT_SORT_NUM_IO_PAGES);
}
}
else
{
/* the caller is holding bufptr->mutex */
#if defined(CUBRID_DEBUG)
pgbuf_scramble (&bufptr->iopage_buffer->iopage);
#endif /* CUBRID_DEBUG */
/* Don't need to read page from disk since it is a new page. */
if (pgbuf_is_temporary_volume (vpid->volid) == true)
{
pgbuf_init_temp_page_lsa (&bufptr->iopage_buffer->iopage, IO_PAGESIZE);
}
else
{
fileio_init_lsa_of_page (&bufptr->iopage_buffer->iopage, IO_PAGESIZE);
}
/* perm volume */
if (bufptr->vpid.volid > NULL_VOLID)
{
/* Init Page identifier of NEW_PAGE */
bufptr->iopage_buffer->iopage.prv.pageid = -1;
bufptr->iopage_buffer->iopage.prv.volid = -1;
}
if (thread_get_sort_stats_active (thread_p))
{
perfmon_inc_stat (thread_p, PSTAT_SORT_NUM_DATA_PAGES);
}
show_status->num_pages_created++;
show_status->num_hit++;
}
return bufptr;
}
/*
* pgbuf_victimize_bcb () - Victimize given buffer page
* return: NO_ERROR, or ER_code
* bufptr(in): pointer to buffer page
*/
static int
pgbuf_victimize_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
{
PGBUF_ATOMIC_LATCH_IMPL impl;
#if defined(SERVER_MODE)
if (thread_p == NULL)
{
assert (thread_p != NULL);
thread_p = thread_get_thread_entry_info ();
}
#endif /* SERVER_MODE */
/* the caller is holding bufptr->mutex */
/* before-flush, check victim condition again */
if (!pgbuf_is_bcb_victimizable (bufptr, true))
{
assert (false);
PGBUF_BCB_UNLOCK (bufptr);
return ER_FAILED;
}
if (pgbuf_bcb_is_to_vacuum (bufptr))
{
pgbuf_bcb_update_flags (thread_p, bufptr, 0, PGBUF_BCB_TO_VACUUM_FLAG);
}
assert (get_latch (&bufptr->atomic_latch) == PGBUF_NO_LATCH);
/* a safe victim */
if (pgbuf_delete_from_hash_chain (thread_p, bufptr) != NO_ERROR)
{
return ER_FAILED;
}
impl = get_impl (&bufptr->atomic_latch);
impl.impl.latch_mode = PGBUF_LATCH_INVALID;
impl.impl.waiter_exists = false;
impl.impl.fcnt = 0;
bufptr->atomic_latch.store (impl.raw);
/* If above function returns success, the caller is still holding bufptr->mutex.
* Otherwise, the caller does not hold bufptr->mutex.
*/
/* at this point, the caller is holding bufptr->mutex */
return NO_ERROR;
}
/*
* pgbuf_invalidate_bcb () - Invalidates BCB
* return: NO_ERROR, or ER_code
* bufptr(in): pointer to buffer page
*/
static int
pgbuf_invalidate_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
{
/* the caller is holding bufptr->mutex */
/* be sure that there is not any reader/writer */
if (get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_INVALID)
{
PGBUF_BCB_UNLOCK (bufptr);
return NO_ERROR;
}
if (pgbuf_bcb_is_direct_victim (bufptr))
{
/* bcb is already assigned as direct victim, should be victimized soon, so there is no point in invalidating it
* here */
PGBUF_BCB_UNLOCK (bufptr);
return NO_ERROR;
}
pgbuf_bcb_clear_dirty (thread_p, bufptr);
LSA_SET_NULL (&bufptr->oldest_unflush_lsa);
/* bufptr->mutex is still held by the caller. */
switch (pgbuf_bcb_get_zone (bufptr))
{
case PGBUF_VOID_ZONE:
break;
default:
assert (PGBUF_IS_BCB_IN_LRU (bufptr));
pgbuf_lru_remove_bcb (thread_p, bufptr);
break;
}
if (get_latch (&bufptr->atomic_latch) == PGBUF_NO_LATCH)
{
if (pgbuf_delete_from_hash_chain (thread_p, bufptr) != NO_ERROR)
{
return ER_FAILED;
}
/* If above function returns failure, the caller does not hold bufptr->mutex. Otherwise, the caller is
* holding bufptr->mutex. */
/* Now, the caller is holding bufptr->mutex. */
/* bufptr->mutex will be released in following function. */
pgbuf_put_bcb_into_invalid_list (thread_p, bufptr);
}
else
{
/* todo: what to do? */
assert (false);
set_latch (&bufptr->atomic_latch, PGBUF_NO_LATCH);
PGBUF_BCB_UNLOCK (bufptr);
}
return NO_ERROR;
}
/*
* pgbuf_bcb_safe_flush_force_unlock () - safe-flush bcb and make sure it does not remain locked.
*
* return : error code
* thread_p (in) : thread entry
* bufptr (in) : bcb to flush
* synchronous (in) : true if caller wants to wait for bcb to be flushed (if it cannot flush immediately it gets
* blocked). if false, the caller will only request flush and continue.
*/
static int
pgbuf_bcb_safe_flush_force_unlock (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, bool synchronous)
{
int error_code = NO_ERROR;
bool locked = true;
error_code = pgbuf_bcb_safe_flush_internal (thread_p, bufptr, synchronous, &locked);
if (locked)
{
PGBUF_BCB_UNLOCK (bufptr);
}
return error_code;
}
/*
* pgbuf_bcb_safe_flush_force_lock () - safe-flush bcb and make sure it remains locked.
*
* return : error code
* thread_p (in) : thread entry
* bufptr (in) : bcb to flush
* synchronous (in) : true if caller wants to wait for bcb to be flushed (if it cannot flush immediately it gets
* blocked). if false, the caller will only request flush and continue.
*/
static int
pgbuf_bcb_safe_flush_force_lock (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, bool synchronous)
{
int error_code = NO_ERROR;
bool locked = true;
error_code = pgbuf_bcb_safe_flush_internal (thread_p, bufptr, synchronous, &locked);
if (error_code != NO_ERROR)
{
if (locked)
{
PGBUF_BCB_UNLOCK (bufptr);
}
return error_code;
}
if (!locked)
{
PGBUF_BCB_LOCK (bufptr);
}
return NO_ERROR;
}
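/* Usage note (illustrative, not part of the build): the two wrappers above differ only in the post-condition on the
 * bcb mutex. A caller that must keep operating on the bcb afterwards would use the force-lock variant:
 *
 *   if (pgbuf_bcb_safe_flush_force_lock (thread_p, bufptr, true) != NO_ERROR)
 *     {
 *       return ER_FAILED;       // on error, the bcb mutex is NOT held
 *     }
 *   // on success, the bcb mutex is held again here
 *   PGBUF_BCB_UNLOCK (bufptr);
 */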
/*
* pgbuf_bcb_safe_flush_internal () - safe-flush bcb. function will do all the necessary checks. flush is executed only
* if bcb is dirty. function is safe with regard to concurrent latches and flushes.
*
* return : error code
* thread_p (in) : thread entry
* bufptr (in) : bcb to flush
* synchronous (in) : true if caller wants to wait for bcb to be flushed (if it cannot flush immediately it gets
* blocked). if false, the caller will only request flush and continue.
* locked (out) : output if bcb is locked.
*/
static int
pgbuf_bcb_safe_flush_internal (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, bool synchronous, bool * locked)
{
int error_code = NO_ERROR;
PGBUF_ATOMIC_LATCH_IMPL impl, impl_new;
bool immediate_flush = false, block = false, is_flushing = false;
assert (get_latch (&bufptr->atomic_latch) != PGBUF_LATCH_FLUSH);
PGBUF_BCB_CHECK_OWN (bufptr);
*locked = true;
/* the caller is holding bufptr->mutex */
if (!pgbuf_bcb_is_dirty (bufptr))
{
/* not dirty; flush is not required */
return NO_ERROR;
}
/* there are two cases when we cannot flush immediately:
* 1. page is write latched. we cannot know when the latcher makes modifications, so it is not safe to flush the page.
* 2. another thread is already flushing. allowing multiple concurrent flushes is not safe (we cannot guarantee the
* order of disk writing, therefore it is theoretically possible to write an old version over a newer version of
* the page).
*
* for the first case, we use the PGBUF_BCB_ASYNC_FLUSH_REQ flag to request a flush from the thread holding latch.
* for the second case, we know the bcb is already being flushed. if we need to be sure page is flushed, we'll put
* ourselves in bcb's waiting list (and a thread doing flush should wake us).
*/
do
{
immediate_flush = false;
block = false;
is_flushing = false;
impl = get_impl (&bufptr->atomic_latch);
impl_new = impl;
is_flushing = pgbuf_bcb_is_flushing (bufptr);
if (!is_flushing
&& (impl.impl.latch_mode == PGBUF_NO_LATCH || impl.impl.latch_mode == PGBUF_LATCH_READ
|| (impl.impl.latch_mode == PGBUF_LATCH_WRITE && pgbuf_find_thrd_holder (thread_p, bufptr) != NULL)))
{
immediate_flush = true;
}
else
{
assert (is_flushing || impl.impl.latch_mode == PGBUF_LATCH_WRITE);
if (synchronous)
{
block = true;
impl_new.impl.waiter_exists = true;
}
}
}
while (!bufptr->atomic_latch.compare_exchange_strong (impl.raw, impl_new.raw, std::memory_order_acq_rel,
std::memory_order_acquire));
if (immediate_flush)
{
/* don't have to wait for writer/flush */
return pgbuf_bcb_flush_with_wal (thread_p, bufptr, false, locked);
}
/* page is write latched. notify the holder to flush it on unfix. */
if (!is_flushing)
{
pgbuf_bcb_update_flags (thread_p, bufptr, PGBUF_BCB_ASYNC_FLUSH_REQ, 0);
}
if (block)
{
/* wait for bcb to be flushed. */
*locked = false;
error_code = pgbuf_block_bcb (thread_p, bufptr, PGBUF_LATCH_FLUSH, 0, false);
if (error_code != NO_ERROR)
{
ASSERT_ERROR ();
}
return error_code;
}
/* don't wait for flush */
return NO_ERROR;
}
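/* Summary of the decision made by pgbuf_bcb_safe_flush_internal, derived from the code above:
 *
 *   bcb state observed under the CAS loop      | action taken
 *   -------------------------------------------+-----------------------------------------------
 *   not dirty                                  | return immediately, nothing to flush
 *   no latch / read latch / own write latch    | flush here via pgbuf_bcb_flush_with_wal
 *   foreign write latch (not flushing)         | set PGBUF_BCB_ASYNC_FLUSH_REQ for the holder
 *   already being flushed                      | nothing to request
 *   + synchronous, when not flushed here       | block on PGBUF_LATCH_FLUSH until woken
 */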
/*
* pgbuf_get_bcb_from_invalid_list () - Get BCB from buffer invalid list
*
* return: If success, a newly allocated BCB, otherwise NULL
* thread_p (in) : thread entry
*
* Note: This function disconnects a BCB on the top of the buffer invalid list
* and returns it. Before disconnection, the thread must hold the
* invalid list mutex and after disconnection, release the mutex.
*/
static PGBUF_BCB *
pgbuf_get_bcb_from_invalid_list (THREAD_ENTRY * thread_p)
{
PGBUF_BCB *bufptr;
#if defined(SERVER_MODE)
int rv;
#endif /* SERVER_MODE */
/* check if invalid BCB list is empty (step 1) */
if (pgbuf_Pool.buf_invalid_list.invalid_top == NULL)
{
return NULL;
}
rv = pthread_mutex_lock (&pgbuf_Pool.buf_invalid_list.invalid_mutex);
/* check if invalid BCB list is empty (step 2) */
if (pgbuf_Pool.buf_invalid_list.invalid_top == NULL)
{
/* invalid BCB list is empty */
pthread_mutex_unlock (&pgbuf_Pool.buf_invalid_list.invalid_mutex);
return NULL;
}
else
{
/* invalid BCB list is not empty */
bufptr = pgbuf_Pool.buf_invalid_list.invalid_top;
pgbuf_Pool.buf_invalid_list.invalid_top = bufptr->next_BCB;
pgbuf_Pool.buf_invalid_list.invalid_cnt -= 1;
pthread_mutex_unlock (&pgbuf_Pool.buf_invalid_list.invalid_mutex);
PGBUF_BCB_LOCK (bufptr);
bufptr->next_BCB = NULL;
pgbuf_bcb_change_zone (thread_p, bufptr, 0, PGBUF_VOID_ZONE);
perfmon_inc_stat (thread_p, PSTAT_PB_VICTIM_USE_INVALID_BCB);
return bufptr;
}
}
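/* Note: the two emptiness checks above form a double-checked pattern. A minimal sketch of the same idiom:
 *
 *   if (list->invalid_top == NULL)        // cheap, lock-free check for the common empty case
 *     return NULL;
 *   pthread_mutex_lock (&list->invalid_mutex);
 *   if (list->invalid_top == NULL)        // authoritative re-check; a concurrent consumer may
 *     {                                   // have emptied the list in between
 *       pthread_mutex_unlock (&list->invalid_mutex);
 *       return NULL;
 *     }
 *   // ... pop the top bcb under the mutex ...
 */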
/*
* pgbuf_put_bcb_into_invalid_list () - Put BCB into buffer invalid list
* return: NO_ERROR
* bufptr(in):
*
* Note: This function connects BCB to the top of the buffer invalid list and
* makes its zone PGBUF_INVALID_ZONE. Before connection, the thread must hold the
* invalid list mutex and after connection, release the mutex.
*/
static int
pgbuf_put_bcb_into_invalid_list (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
{
#if defined(SERVER_MODE)
int rv;
#endif /* SERVER_MODE */
/* the caller is holding bufptr->mutex */
VPID_SET_NULL (&bufptr->vpid);
set_latch (&bufptr->atomic_latch, PGBUF_LATCH_INVALID);
assert ((bufptr->flags & PGBUF_BCB_FLAGS_MASK) == 0);
pgbuf_bcb_change_zone (thread_p, bufptr, 0, PGBUF_INVALID_ZONE);
pgbuf_bcb_check_and_reset_fix_and_avoid_dealloc (bufptr, ARG_FILE_LINE);
rv = pthread_mutex_lock (&pgbuf_Pool.buf_invalid_list.invalid_mutex);
bufptr->next_BCB = pgbuf_Pool.buf_invalid_list.invalid_top;
pgbuf_Pool.buf_invalid_list.invalid_top = bufptr;
pgbuf_Pool.buf_invalid_list.invalid_cnt += 1;
PGBUF_BCB_UNLOCK (bufptr);
pthread_mutex_unlock (&pgbuf_Pool.buf_invalid_list.invalid_mutex);
return NO_ERROR;
}
/*
* pgbuf_get_shared_lru_index_for_add () - get a shared index to add a new bcb. we'll use a round-robin way to choose
* next list, but we'll avoid the biggest list (just to keep things balanced).
*
* return : shared lru index
*/
STATIC_INLINE int
pgbuf_get_shared_lru_index_for_add (void)
{
#define PAGE_ADD_REFRESH_STAT \
MAX (2 * pgbuf_Pool.num_buffers / PGBUF_SHARED_LRU_COUNT, 10000)
int i;
unsigned int lru_idx, refresh_stat_cnt;
lru_idx = ATOMIC_INC_32 (&pgbuf_Pool.quota.add_shared_lru_idx, 1);
refresh_stat_cnt = lru_idx % PAGE_ADD_REFRESH_STAT;
/* check if there is an imbalanced BCB distribution across the shared LRUs */
if (refresh_stat_cnt == 0)
{
int shared_lru_bcb_sum;
int max_bcb, min_bcb;
int lru_idx_with_max;
int this_lru_cnt;
int curr_avoid_lru_idx;
shared_lru_bcb_sum = 0;
max_bcb = 0;
min_bcb = pgbuf_Pool.num_buffers;
lru_idx_with_max = -1;
/* update unbalanced LRU idx */
for (i = 0; i < PGBUF_SHARED_LRU_COUNT; i++)
{
this_lru_cnt = PGBUF_LRU_LIST_COUNT (PGBUF_GET_LRU_LIST (i));
shared_lru_bcb_sum += this_lru_cnt;
if (this_lru_cnt > max_bcb)
{
max_bcb = this_lru_cnt;
lru_idx_with_max = i;
}
if (this_lru_cnt < min_bcb)
{
min_bcb = this_lru_cnt;
}
}
if (shared_lru_bcb_sum > pgbuf_Pool.num_buffers / 10
&& (max_bcb > (int) (1.3f * shared_lru_bcb_sum) / PGBUF_SHARED_LRU_COUNT || max_bcb > 2 * min_bcb))
{
ATOMIC_TAS_32 (&pgbuf_Pool.quota.avoid_shared_lru_idx, lru_idx_with_max);
}
else
{
curr_avoid_lru_idx = pgbuf_Pool.quota.avoid_shared_lru_idx;
if (curr_avoid_lru_idx == -1
|| (PGBUF_LRU_LIST_COUNT (PGBUF_GET_LRU_LIST (curr_avoid_lru_idx))
< shared_lru_bcb_sum / PGBUF_SHARED_LRU_COUNT))
{
ATOMIC_TAS_32 (&pgbuf_Pool.quota.avoid_shared_lru_idx, -1);
}
}
}
lru_idx = lru_idx % PGBUF_SHARED_LRU_COUNT;
/* avoid adding to a shared LRU index that has too many BCBs */
if (pgbuf_Pool.quota.avoid_shared_lru_idx == (int) lru_idx)
{
lru_idx = ATOMIC_INC_32 (&pgbuf_Pool.quota.add_shared_lru_idx, 1);
lru_idx = lru_idx % PGBUF_SHARED_LRU_COUNT;
}
return lru_idx;
#undef PAGE_ADD_REFRESH_STAT
}
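/* Worked example for the round-robin selection above (assuming PGBUF_SHARED_LRU_COUNT == 4): consecutive callers draw
 * add_shared_lru_idx values 17, 18, 19, 20, which map to lru_idx 1, 2, 3, 0. If avoid_shared_lru_idx == 2, the caller
 * that drew index 2 re-draws and takes the next round-robin slot instead, so the overloaded list receives no new
 * bcb's until the imbalance clears. */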
/*
* pgbuf_get_victim () - get a victim bcb from page buffer.
*
* return : victim candidate or NULL if no candidate was found
* thread_p (in) : thread entry
*
* Note: If a victim BCB is found, this function will already lock it. This means that the caller will have exclusive
* access to the returned BCB.
*/
static PGBUF_BCB *
pgbuf_get_victim (THREAD_ENTRY * thread_p)
{
#define PERF(id) if (detailed_perf) perfmon_inc_stat (thread_p, id)
PGBUF_BCB *victim = NULL;
bool detailed_perf = perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION);
bool has_flush_thread = pgbuf_is_page_flush_daemon_available ();
int nloops = 0; /* used as safe-guard against infinite loops */
int private_lru_idx;
PGBUF_LRU_LIST *lru_list = NULL;
bool restrict_other = false;
bool searched_own = false;
UINT64 initial_consume_cursor, current_consume_cursor;
PERF_UTIME_TRACKER perf_tracker = PERF_UTIME_TRACKER_INITIALIZER;
ATOMIC_INC_32 (&pgbuf_Pool.monitor.lru_victim_req_cnt, 1);
/* how this works:
* we need to find a victim in one of all lru lists. we have two lru list types: private and shared. private are pages
* fixed by a single transaction, while shared are pages fixed by multiple transactions. we usually prioritize the
* private lists.
* the order in which we look for victims is this:
* 1. first search in own private list if it is not under quota.
* 2. look in another private list.
* 3. look in a shared list.
*
* normally, if the system does not lack victims, one of the three searches should provide a victim candidate.
* however, we can be unlucky and not find a candidate with the three steps. this is especially possible when we have
* only one active transaction, with long transactions, and many vacuum workers trying to catch up. all candidates
* are found in a single private list, which means that many vacuum workers may not find the lists in lru queue.
* for this case, we loop the three searches, as long as pgbuf_Pool.monitor.victim_rich is true.
*
* note: if quota is disabled (although this is not recommended), only shared lists are searched.
*
* note: if all above failed to produce a victim, we'll try to victimize from own private even if it is under quota.
* we found a strange particular case when all private lists were on par with their quota's (but just below),
* shared lists had no lru 3 zone and nothing could be victimized or flushed.
*/
/* 1. search own private list */
if (PGBUF_THREAD_HAS_PRIVATE_LRU (thread_p))
{
/* first try my own private list */
private_lru_idx = PGBUF_LRU_INDEX_FROM_PRIVATE (PGBUF_PRIVATE_LRU_FROM_THREAD (thread_p));
lru_list = PGBUF_GET_LRU_LIST (private_lru_idx);
/* don't victimize from own list if it is under quota */
if (PGBUF_LRU_LIST_IS_ONE_TWO_OVER_QUOTA (lru_list)
|| (PGBUF_LRU_LIST_IS_OVER_QUOTA (lru_list) && lru_list->count_vict_cand > 0))
{
if (detailed_perf)
{
PERF_UTIME_TRACKER_START (thread_p, &perf_tracker);
}
victim = pgbuf_get_victim_from_lru_list (thread_p, private_lru_idx);
if (victim != NULL)
{
PERF (PSTAT_PB_OWN_VICTIM_PRIVATE_LRU_SUCCESS);
if (detailed_perf)
{
PERF_UTIME_TRACKER_TIME (thread_p, &perf_tracker, PSTAT_PB_VICTIM_SEARCH_OWN_PRIVATE_LISTS);
}
return victim;
}
/* failed */
PERF (PSTAT_PB_VICTIM_OWN_PRIVATE_LRU_FAIL);
if (detailed_perf)
{
PERF_UTIME_TRACKER_TIME (thread_p, &perf_tracker, PSTAT_PB_VICTIM_SEARCH_OWN_PRIVATE_LISTS);
}
/* if over quota, we are not allowed to search in other lru lists. we'll wait for victim.
* note: except vacuum threads who ignore unfixes and have no quota. */
if (!PGBUF_VACUUM_SHOULD_IGNORE_UNFIX (thread_p))
{
/* still, offer a chance to those that are just slightly over quota. this actually targets new
* transactions that do not have a quota yet... let them get a few bcb's first until their activity
* becomes relevant. */
restrict_other = PGBUF_LRU_LIST_IS_OVER_QUOTA_WITH_BUFFER (lru_list);
}
searched_own = true;
}
}
/* 2. search other private list.
*
* note: in single-thread context, the only list is mine. no point in trying to victimize again
* note: if restrict_other is true, only other big private lists can be used for victimization
*/
if (PGBUF_PAGE_QUOTA_IS_ENABLED && has_flush_thread)
{
if (detailed_perf)
{
PERF_UTIME_TRACKER_START (thread_p, &perf_tracker);
}
victim = pgbuf_lfcq_get_victim_from_private_lru (thread_p, restrict_other);
if (victim != NULL)
{
if (detailed_perf)
{
PERF_UTIME_TRACKER_TIME (thread_p, &perf_tracker, PSTAT_PB_VICTIM_SEARCH_OTHERS_PRIVATE_LISTS);
}
return victim;
}
if (detailed_perf)
{
PERF_UTIME_TRACKER_TIME (thread_p, &perf_tracker, PSTAT_PB_VICTIM_SEARCH_OTHERS_PRIVATE_LISTS);
}
}
/* loop:
*
* DOESN'T HAVE FLUSH THREAD: one iteration could fail, because the shared list's last victims have been set dirty.
* however, if there are other lists having victims, we should find them.
* it is possible to not have any victims, in which case the shared list queue should become empty. we'll have to do a
* flush and search again.
* we'd like to avoid looping infinitely (if there's a bug), so we use the nloops safe-guard. Each shared list should
* be removed after a failed search, so the maximum accepted number of loops is pgbuf_Pool.num_LRU_list.
*/
if (detailed_perf)
{
PERF_UTIME_TRACKER_START (thread_p, &perf_tracker);
}
initial_consume_cursor = pgbuf_Pool.shared_lrus_with_victims->get_consumer_cursor ();
do
{
/* 3. search a shared list. */
victim = pgbuf_lfcq_get_victim_from_shared_lru (thread_p, has_flush_thread);
if (victim != NULL)
{
if (detailed_perf)
{
PERF_UTIME_TRACKER_TIME (thread_p, &perf_tracker, PSTAT_PB_VICTIM_SEARCH_SHARED_LISTS);
}
return victim;
}
current_consume_cursor = pgbuf_Pool.shared_lrus_with_victims->get_consumer_cursor ();
}
while (!has_flush_thread && !pgbuf_Pool.shared_lrus_with_victims->is_empty ()
&& ((int) (current_consume_cursor - initial_consume_cursor) <= pgbuf_Pool.num_LRU_list)
&& (++nloops <= pgbuf_Pool.num_LRU_list));
/* todo: maybe we can find a less complicated condition of looping. Probably no need to use nloops <= pgbuf_Pool.num_LRU_list. */
if (detailed_perf)
{
PERF_UTIME_TRACKER_TIME (thread_p, &perf_tracker, PSTAT_PB_VICTIM_SEARCH_SHARED_LISTS);
}
/* no victim found... */
assert (victim == NULL);
PERF (PSTAT_PB_VICTIM_ALL_LRU_FAIL);
if (PGBUF_THREAD_HAS_PRIVATE_LRU (thread_p) && !searched_own)
{
/* try on own private even if it is under quota. */
private_lru_idx = PGBUF_LRU_INDEX_FROM_PRIVATE (PGBUF_PRIVATE_LRU_FROM_THREAD (thread_p));
lru_list = PGBUF_GET_LRU_LIST (private_lru_idx);
victim = pgbuf_get_victim_from_lru_list (thread_p, private_lru_idx);
if (victim != NULL)
{
PERF (PSTAT_PB_OWN_VICTIM_PRIVATE_LRU_SUCCESS);
return victim;
}
/* failed */
if (detailed_perf)
{
PERF (PSTAT_PB_VICTIM_OWN_PRIVATE_LRU_FAIL);
}
}
assert (victim == NULL);
return victim;
#undef PERF
}
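/* Note on the loop guard above, derived from the comments in the code: without a flush thread, each failed search
 * removes that shared list from shared_lrus_with_victims, so the consumer cursor can advance at most num_LRU_list
 * times before the queue empties; the nloops counter is a second, redundant bound in case the cursor misbehaves. */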
/*
* pgbuf_is_bcb_fixed_by_any () - is page fixed by any thread?
*
* return : true if the bcb is fixed by any thread, false otherwise
* PGBUF_BCB * bcb (in) : bcb
* has_mutex_lock (in) : true if current thread has lock on bcb
*
* note: if has_mutex_lock is true, even if bcb->latch_mode is not PGBUF_NO_LATCH, we consider this to be temporary.
* this must be during pgbuf_unfix and latch_mode will be set to PGBUF_NO_LATCH before bcb mutex is released.
*/
STATIC_INLINE bool
pgbuf_is_bcb_fixed_by_any (PGBUF_BCB * bcb, bool has_mutex_lock)
{
#if defined (SERVER_MODE)
if (has_mutex_lock)
{
PGBUF_BCB_CHECK_OWN (bcb);
}
/* note: sometimes, the next wait thread could only be threads waiting for flush. however, these are exceptional
* cases. we'd rather miss a few good bcb's from time to time, rather than processing the waiting list for
* every bcb. */
return get_fcnt (&bcb->atomic_latch) > 0 || bcb->next_wait_thrd != NULL
|| (!has_mutex_lock && get_latch (&bcb->atomic_latch) != PGBUF_NO_LATCH);
#else /* !SERVER_MODE */
return get_fcnt (&bcb->atomic_latch) != 0;
#endif /* !SERVER_MODE */
}
/*
* pgbuf_is_bcb_victimizable () - check whether bcb can be victimized.
*
* return : true if bcb can be victimized, false otherwise
* bcb (in) : bcb
* has_mutex_lock (in) : true if bcb mutex is owned
*/
STATIC_INLINE bool
pgbuf_is_bcb_victimizable (PGBUF_BCB * bcb, bool has_mutex_lock)
{
/* must not be dirty or otherwise flagged to avoid victimization */
if (pgbuf_bcb_avoid_victim (bcb))
{
return false;
}
#if defined (SERVER_MODE)
/* must not be fixed and must not have waiters. */
if (pgbuf_is_bcb_fixed_by_any (bcb, has_mutex_lock))
{
return false;
}
#endif /* SERVER_MODE */
/* valid */
return true;
}
/*
* pgbuf_get_victim_from_lru_list () - Get victim BCB from the bottom of LRU list
* return: If success, BCB, otherwise NULL
* lru_idx (in) : index of LRU list
*
* Note: This function disconnects BCB from the bottom of the LRU list and returns it if its fcnt == 0.
* If its fcnt != 0, it makes bufptr->prev_BCB the bottom and retries.
* During this processing, the caller must be the holder of the LRU list mutex.
*/
static PGBUF_BCB *
pgbuf_get_victim_from_lru_list (THREAD_ENTRY * thread_p, const int lru_idx)
{
#define PERF(pstatid) if (perf_tracking) perfmon_inc_stat (thread_p, pstatid)
#define MAX_DEPTH 1000
PGBUF_BCB *bufptr;
int found_victim_cnt = 0;
int search_cnt = 0;
int lru_victim_cnt = 0;
PGBUF_LRU_LIST *lru_list;
PGBUF_BCB *bufptr_victimizable = NULL;
PGBUF_BCB *bufptr_start = NULL;
PGBUF_BCB *victim_hint = NULL;
bool perf_tracking = perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION);
lru_list = &pgbuf_Pool.buf_LRU_list[lru_idx];
PERF (PSTAT_PB_VICTIM_GET_FROM_LRU);
/* check if LRU list is empty */
if (lru_list->count_vict_cand == 0)
{
PERF (PSTAT_PB_VICTIM_GET_FROM_LRU_LIST_WAS_EMPTY);
return NULL;
}
pthread_mutex_lock (&lru_list->mutex);
if (lru_list->bottom == NULL || !PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->bottom))
{
/* no zone 3 */
PERF (PSTAT_PB_VICTIM_GET_FROM_LRU_LIST_WAS_EMPTY);
pthread_mutex_unlock (&lru_list->mutex);
return NULL;
}
if (PGBUF_IS_PRIVATE_LRU_ONE_TWO_OVER_QUOTA (lru_idx))
{
/* first adjust lru1 zone */
pgbuf_lru_adjust_zones (thread_p, lru_list, false);
}
/* search for non dirty bcb */
lru_victim_cnt = lru_list->count_vict_cand;
if (lru_victim_cnt <= 0)
{
/* no victims */
PERF (PSTAT_PB_VICTIM_GET_FROM_LRU_LIST_WAS_EMPTY);
assert (lru_victim_cnt == 0);
pthread_mutex_unlock (&lru_list->mutex);
return NULL;
}
if (!pgbuf_bcb_is_dirty (lru_list->bottom) && lru_list->victim_hint != lru_list->bottom)
{
/* update hint to bottom. sometimes it may be out of sync. */
assert (PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->bottom));
if (PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->bottom))
{
(void) ATOMIC_TAS_ADDR (&lru_list->victim_hint, lru_list->bottom);
}
else
{
(void) ATOMIC_TAS_ADDR (&lru_list->victim_hint, (PGBUF_BCB *) NULL);
}
}
/* we will search */
found_victim_cnt = 0;
bufptr_victimizable = NULL;
/* start searching with victim hint */
victim_hint = lru_list->victim_hint;
if (victim_hint == NULL)
{
bufptr_start = lru_list->bottom;
}
else
{
bufptr_start = victim_hint;
}
for (bufptr = bufptr_start; bufptr != NULL && PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (bufptr) && search_cnt < MAX_DEPTH;
bufptr = bufptr->prev_BCB, search_cnt++)
{
/* must not be any other case that invalidates a victim: is flushing, direct victim */
if (pgbuf_bcb_avoid_victim (bufptr))
{
/* this bcb is not valid for victimization */
continue;
}
/* must not be fixed */
if (pgbuf_is_bcb_fixed_by_any (bufptr, false))
{
/* this bcb cannot be used now, but it is a valid victim candidate. maybe we should update victim hint */
if (bufptr_victimizable == NULL)
{
bufptr_victimizable = bufptr;
/* update hint if this is not bufptr_start and hint has not changed in the meantime. */
if (bufptr_victimizable != victim_hint
&& ATOMIC_CAS_ADDR (&lru_list->victim_hint, victim_hint, bufptr_victimizable))
{
/* hint advanced */
}
assert (lru_list->victim_hint == NULL || PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->victim_hint));
}
found_victim_cnt++;
if (found_victim_cnt >= lru_victim_cnt)
{
/* early out: probably we won't find others */
break;
}
continue;
}
/* a victim candidate. we need to lock its BCB, but since we have LRU mutex, we can only do it conditionally.
* chances are we'll get the mutex. */
if (PGBUF_BCB_TRYLOCK (bufptr) == 0)
{
if (pgbuf_is_bcb_victimizable (bufptr, true))
{
if (bufptr_victimizable == NULL)
{
/* try to update hint to bufptr->prev_BCB */
pgbuf_lru_advance_victim_hint (thread_p, lru_list, victim_hint, bufptr->prev_BCB, false);
}
pgbuf_remove_from_lru_list (thread_p, bufptr, lru_list);
#if defined (SERVER_MODE)
/* todo: this is a hack */
if (pgbuf_Pool.direct_victims.waiter_threads_low_priority->size ()
>= (5 + (thread_num_total_threads () / 20)))
{
pgbuf_panic_assign_direct_victims_from_lru (thread_p, lru_list, bufptr->prev_BCB);
}
#endif /* SERVER_MODE */
if (lru_list->bottom != NULL && pgbuf_bcb_is_dirty (lru_list->bottom)
&& pgbuf_is_page_flush_daemon_available ())
{
/* new bottom is dirty... make sure that flush will wake up */
pgbuf_wakeup_page_flush_daemon (thread_p);
}
pthread_mutex_unlock (&lru_list->mutex);
pgbuf_add_vpid_to_aout_list (thread_p, &bufptr->vpid, lru_idx);
return bufptr;
}
else
{
PGBUF_BCB_UNLOCK (bufptr);
}
}
else
{
/* failed try lock in single-threaded? impossible */
assert (pgbuf_is_page_flush_daemon_available ());
/* save this victim candidate bufptr. maybe its lock will be released before we finish the search */
if (bufptr_victimizable == NULL)
{
bufptr_victimizable = bufptr;
/* try to replace victim if it was not already changed. */
if (bufptr != victim_hint && ATOMIC_CAS_ADDR (&lru_list->victim_hint, victim_hint, bufptr_victimizable))
{
/* modified hint */
}
assert (lru_list->victim_hint == NULL || PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->victim_hint));
}
found_victim_cnt++;
if (found_victim_cnt >= lru_victim_cnt)
{
/* early out: probably we won't find others */
break;
}
}
}
PERF (PSTAT_PB_VICTIM_GET_FROM_LRU_FAIL);
if (bufptr_victimizable == NULL && victim_hint != NULL)
{
/* we had a hint and we failed to find any victim candidates. */
PERF (PSTAT_PB_VICTIM_GET_FROM_LRU_BAD_HINT);
assert (PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->bottom));
if (lru_list->count_vict_cand > 0 && PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->bottom))
{
/* set victim hint to bottom */
(void) ATOMIC_CAS_ADDR (&lru_list->victim_hint, victim_hint, lru_list->bottom);
}
else
{
/* no hint */
(void) ATOMIC_CAS_ADDR (&lru_list->victim_hint, victim_hint, (PGBUF_BCB *) NULL);
}
}
pthread_mutex_unlock (&lru_list->mutex);
/* we need more victims */
pgbuf_wakeup_page_flush_daemon (thread_p);
/* failed finding victim in single-threaded, although the number of victim candidates is positive? impossible!
* note: not really impossible. the thread may have the victimizable fixed. but bufptr_victimizable must not be
* NULL. */
assert (pgbuf_is_page_flush_daemon_available () || (bufptr_victimizable != NULL) || (search_cnt == MAX_DEPTH));
return NULL;
#undef PERF
#undef MAX_DEPTH
}
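/* Note on the victim hint used above (derived from the code): the hint points at a known victim candidate deep in
 * zone 3, so searches can start there instead of walking from the absolute bottom every time. It is moved with
 * compare-and-swap so concurrent searchers never clobber each other's updates:
 *
 *   // advance only if nobody replaced victim_hint since we read it
 *   ATOMIC_CAS_ADDR (&lru_list->victim_hint, victim_hint, new_hint);
 */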
#if defined (SERVER_MODE)
/*
* pgbuf_panic_assign_direct_victims_from_lru () - panic assign direct victims from lru.
*
* return : number of assigned victims.
* thread_p (in) : thread entry
* lru_list (in) : lru list
* bcb_start (in) : starting bcb
*/
static int
pgbuf_panic_assign_direct_victims_from_lru (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, PGBUF_BCB * bcb_start)
{
#define MAX_DEPTH 1000
PGBUF_BCB *bcb = NULL;
int n_assigned = 0;
int count = 0;
/* statistics shows not useful */
if (bcb_start == NULL)
{
return 0;
}
assert (pgbuf_bcb_get_lru_index (bcb_start) == lru_list->index);
/* panic victimization function */
for (bcb = bcb_start;
bcb != NULL && PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (bcb) && lru_list->count_vict_cand > 0 && count < MAX_DEPTH;
bcb = bcb->prev_BCB, count++)
{
assert (pgbuf_bcb_get_lru_index (bcb) == lru_list->index);
if (!pgbuf_is_bcb_victimizable (bcb, false))
{
continue;
}
/* lock mutex. just try. */
if (PGBUF_BCB_TRYLOCK (bcb) != 0)
{
continue;
}
if (!pgbuf_is_bcb_victimizable (bcb, true))
{
PGBUF_BCB_UNLOCK (bcb);
continue;
}
if (!pgbuf_assign_direct_victim (thread_p, bcb))
{
/* no more waiting threads */
PGBUF_BCB_UNLOCK (bcb);
break;
}
/* assigned directly */
PGBUF_BCB_UNLOCK (bcb);
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
perfmon_inc_stat (thread_p, PSTAT_PB_VICTIM_ASSIGN_DIRECT_PANIC);
}
n_assigned++;
}
return n_assigned;
#undef MAX_DEPTH
}
/*
* pgbuf_direct_victims_maintenance () - assign direct victims via searching. the purpose of this function is to make
* sure a victim is assigned even when the system has low to no
* activity (which can otherwise prevent bcb's from being fed to
* waiting threads). basically, this is the backup plan.
*
* return : void
* thread_p (in) : thread entry
*/
void
pgbuf_direct_victims_maintenance (THREAD_ENTRY * thread_p)
{
#define DEFAULT_ASSIGNS_PER_ITERATION 5
int nassigns = DEFAULT_ASSIGNS_PER_ITERATION;
bool restarted;
int index;
/* note this is designed for single-threaded use only. the static values are used to pick lists in a round-robin
* manner */
static int prv_index = 0;
static int shr_index = 0;
/* privates */
for (index = prv_index, restarted = false;
pgbuf_is_any_thread_waiting_for_direct_victim () && nassigns > 0 && (index != prv_index || !restarted);
(index == PGBUF_PRIVATE_LRU_COUNT - 1) ? (index = 0, restarted = true) : index++)
{
pgbuf_lfcq_assign_direct_victims (thread_p, PGBUF_LRU_INDEX_FROM_PRIVATE (index), &nassigns);
}
prv_index = index;
/* shared */
for (index = shr_index, restarted = false;
pgbuf_is_any_thread_waiting_for_direct_victim () && nassigns > 0 && (index != shr_index || !restarted);
(index == PGBUF_SHARED_LRU_COUNT - 1) ? (index = 0, restarted = true) : index++)
{
pgbuf_lfcq_assign_direct_victims (thread_p, index, &nassigns);
}
shr_index = index;
#undef DEFAULT_ASSIGNS_PER_ITERATION
}
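/* Illustrative trace (hypothetical sizes): with PGBUF_PRIVATE_LRU_COUNT == 8 and prv_index == 5, one call scans
 * private lists 5, 6, 7, 0, 1, ... and stops when it wraps back to 5, when nassigns victims have been handed out, or
 * when no thread is waiting for a direct victim; the static prv_index then resumes from the stopping point on the
 * next call. */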
/*
* pgbuf_lfcq_assign_direct_victims () - get list from queue and assign victims directly.
*
* return : void
* thread_p (in) : thread entry
* lru_idx (in) : lru index
* nassign_inout (in/out) : update the number of victims to assign
*/
STATIC_INLINE void
pgbuf_lfcq_assign_direct_victims (THREAD_ENTRY * thread_p, int lru_idx, int *nassign_inout)
{
PGBUF_LRU_LIST *lru_list;
PGBUF_BCB *victim_hint = NULL;
int nassigned = 0;
lru_list = PGBUF_GET_LRU_LIST (lru_idx);
if (lru_list->count_vict_cand > 0)
{
pthread_mutex_lock (&lru_list->mutex);
victim_hint = lru_list->victim_hint;
nassigned = pgbuf_panic_assign_direct_victims_from_lru (thread_p, lru_list, victim_hint);
if (nassigned == 0 && lru_list->count_vict_cand > 0 && pgbuf_is_any_thread_waiting_for_direct_victim ())
{
/* maybe the hint was bad? that's the most likely case. reset the hint to bottom. */
assert (PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->bottom));
if (PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->bottom))
{
(void) ATOMIC_CAS_ADDR (&lru_list->victim_hint, victim_hint, lru_list->bottom);
}
else
{
(void) ATOMIC_CAS_ADDR (&lru_list->victim_hint, victim_hint, (PGBUF_BCB *) NULL);
}
/* check from bottom anyway */
nassigned = pgbuf_panic_assign_direct_victims_from_lru (thread_p, lru_list, lru_list->bottom);
}
pthread_mutex_unlock (&lru_list->mutex);
(*nassign_inout) -= nassigned;
}
}
#endif /* SERVER_MODE */
/*
* pgbuf_lru_add_bcb_to_top () - add a bcb to lru list top
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : bcb added to top
* lru_list (in) : lru list
*/
STATIC_INLINE void
pgbuf_lru_add_bcb_to_top (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, PGBUF_LRU_LIST * lru_list)
{
/* there will be no previous BCB */
bcb->prev_BCB = NULL;
/* next bcb is current top */
bcb->next_BCB = lru_list->top;
/* is list empty? */
if (lru_list->top == NULL)
{
/* yeah. bottom should also be NULL */
assert (lru_list->bottom == NULL);
/* bcb is top and bottom of list */
lru_list->bottom = bcb;
}
else
{
/* update previous top link and change top */
lru_list->top->prev_BCB = bcb;
}
/* we have new top */
lru_list->top = bcb;
if (lru_list->bottom_1 == NULL)
{
/* empty lru 1 zone */
assert (lru_list->count_lru1 == 0);
/* set middle to this bcb */
lru_list->bottom_1 = bcb;
}
/* increment list tick when adding to top */
if (++lru_list->tick_list >= DB_INT32_MAX)
{
lru_list->tick_list = 0;
}
pgbuf_bcb_change_zone (thread_p, bcb, lru_list->index, PGBUF_LRU_1_ZONE);
}
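/* Layout reminder for the lru lists manipulated above and below (top is the hot end; bottom_1 and bottom_2 mark the
 * last bcb of zones 1 and 2):
 *
 *   top -> [ zone 1 ... bottom_1 ] -> [ zone 2 ... bottom_2 ] -> [ zone 3 ... bottom ]
 *
 * An empty zone simply has a NULL bottom_x marker, which is why the add functions re-seed bottom_1/bottom_2 when they
 * insert into an empty zone. */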
/*
* pgbuf_lru_add_bcb_to_middle () - add a bcb to lru list middle
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : bcb added to middle
* lru_list (in) : lru list
*/
STATIC_INLINE void
pgbuf_lru_add_bcb_to_middle (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, PGBUF_LRU_LIST * lru_list)
{
/* is lru 1 zone empty? */
if (lru_list->bottom_1 == NULL)
{
/* yes, zone 1 is empty */
/* is list empty? */
if (lru_list->top == NULL)
{
/* yes, list is empty. set top and bottom to this bcb. */
assert (lru_list->bottom == NULL);
lru_list->top = bcb;
lru_list->bottom = bcb;
/* null prev/next links */
bcb->prev_BCB = NULL;
bcb->next_BCB = NULL;
}
else
{
/* no. we should add the bcb before top. */
assert (pgbuf_bcb_get_zone (lru_list->top) != PGBUF_LRU_1_ZONE);
assert (lru_list->bottom != NULL);
/* link current top with new bcb */
lru_list->top->prev_BCB = bcb;
bcb->next_BCB = lru_list->top;
/* no previous bcb's */
bcb->prev_BCB = NULL;
/* update top */
lru_list->top = bcb;
}
}
else
{
/* no, zone 1 is not empty */
PGBUF_BCB *bcb_next = lru_list->bottom_1->next_BCB;
assert (lru_list->top != NULL);
assert (lru_list->bottom != NULL);
/* insert after middle */
lru_list->bottom_1->next_BCB = bcb;
bcb->prev_BCB = lru_list->bottom_1;
/* and before bcb_next */
bcb->next_BCB = bcb_next;
/* are zones 2/3 empty? */
if (bcb_next == NULL)
{
/* yes. */
/* middle must be also bottom */
assert (lru_list->bottom == lru_list->bottom_1);
/* update bottom */
lru_list->bottom = bcb;
}
else
{
bcb_next->prev_BCB = bcb;
}
}
if (lru_list->bottom_2 == NULL)
{
assert (lru_list->count_lru2 == 0);
lru_list->bottom_2 = bcb;
}
/* save and increment list tick */
if (++lru_list->tick_list >= DB_INT32_MAX)
{
lru_list->tick_list = 0;
}
pgbuf_bcb_change_zone (thread_p, bcb, lru_list->index, PGBUF_LRU_2_ZONE);
}
/*
* pgbuf_lru_add_bcb_to_bottom () - add a bcb to lru list bottom
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : bcb added to bottom
* lru_list (in) : lru list
*/
STATIC_INLINE void
pgbuf_lru_add_bcb_to_bottom (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, PGBUF_LRU_LIST * lru_list)
{
/* is list empty? */
if (lru_list->bottom == NULL)
{
/* yes, list is empty. top must be NULL */
assert (lru_list->top == NULL);
/* update bottom and top */
lru_list->bottom = bcb;
lru_list->top = bcb;
bcb->prev_BCB = NULL;
bcb->next_BCB = NULL;
/* get tick_lru3 */
bcb->tick_lru3 = lru_list->tick_lru3 - 1;
}
else
{
/* no, list is not empty. added after current bottom. */
lru_list->bottom->next_BCB = bcb;
bcb->prev_BCB = lru_list->bottom;
bcb->next_BCB = NULL;
/* set tick_lru3 smaller than current bottom's */
bcb->tick_lru3 =
PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->bottom) ? lru_list->bottom->tick_lru3 - 1 : lru_list->tick_lru3 - 1;
/* update bottom */
lru_list->bottom = bcb;
}
/* make sure tick_lru3 is not negative */
if (bcb->tick_lru3 < 0)
{
bcb->tick_lru3 += DB_INT32_MAX;
}
pgbuf_bcb_change_zone (thread_p, bcb, lru_list->index, PGBUF_LRU_3_ZONE);
}
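/* Example of the tick_lru3 bookkeeping above: zone-3 ticks grow toward DB_INT32_MAX and wrap to 0, and a bcb appended
 * below the current bottom takes bottom->tick_lru3 - 1 so it still compares as older. The negative guard handles the
 * wrap: if lru_list->tick_lru3 is 0, the new bottom would get tick -1 and is rebased to DB_INT32_MAX - 1. */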
/*
* pgbuf_lru_adjust_zone1 () - adjust zone 1 of lru list
*
* return : void
* thread_p (in) : thread entry
* lru_list (in) : lru list
* min_one (in) : true to keep at least one entry.
*/
STATIC_INLINE void
pgbuf_lru_adjust_zone1 (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, bool min_one)
{
int threshold;
PGBUF_BCB *bcb_bottom;
threshold = lru_list->threshold_lru1;
if (min_one)
{
threshold = MAX (1, threshold);
}
if (threshold >= lru_list->count_lru1)
{
/* no adjustments can be made */
return;
}
assert (lru_list->count_lru1 > 0);
assert (lru_list->bottom_1 != NULL);
/* change bcb zones from 1 to 2 until lru 1 zone count is down to zone 1 desired threshold.
* note: if zone 1 desired threshold is bigger, its bottom is not moved. */
if (lru_list->bottom_2 == NULL)
{
/* bottom 1 will become bottom 2. */
lru_list->bottom_2 = lru_list->bottom_1;
}
for (bcb_bottom = lru_list->bottom_1; threshold < lru_list->count_lru1; bcb_bottom = bcb_bottom->prev_BCB)
{
pgbuf_bcb_change_zone (thread_p, bcb_bottom, lru_list->index, PGBUF_LRU_2_ZONE);
}
/* update bottom of lru 1 */
if (lru_list->count_lru1 == 0)
{
lru_list->bottom_1 = NULL;
}
else
{
assert (bcb_bottom != NULL && pgbuf_bcb_get_zone (bcb_bottom) == PGBUF_LRU_1_ZONE);
lru_list->bottom_1 = bcb_bottom;
}
}
/*
* pgbuf_lru_adjust_zone2 () - adjust zone 2 of lru list based on desired threshold.
*
* return : void
* thread_p (in) : thread entry
* lru_list (in) : lru list
* min_one (in) : true to keep at least one entry.
*/
STATIC_INLINE void
pgbuf_lru_adjust_zone2 (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, bool min_one)
{
PGBUF_BCB *bcb_bottom;
PGBUF_BCB *bcb_prev;
int threshold;
threshold = lru_list->threshold_lru2;
if (min_one)
{
threshold = MAX (1, threshold);
}
if (threshold >= lru_list->count_lru2)
{
/* no adjustments can be made */
return;
}
assert (lru_list->count_lru2 > 0);
assert (lru_list->bottom_2 != NULL);
assert (pgbuf_bcb_get_zone (lru_list->bottom_2) == PGBUF_LRU_2_ZONE);
/* change bcb zones from 2 to 3 until lru 2 zone count is down to zone 2 desired threshold. */
for (bcb_bottom = lru_list->bottom_2; threshold < lru_list->count_lru2; bcb_bottom = bcb_prev)
{
/* save prev BCB in case this is removed from list */
bcb_prev = bcb_bottom->prev_BCB;
assert (bcb_bottom != NULL && pgbuf_bcb_get_zone (bcb_bottom) == PGBUF_LRU_2_ZONE);
pgbuf_lru_fall_bcb_to_zone_3 (thread_p, bcb_bottom, lru_list);
}
/* update bottom of lru 2 */
if (lru_list->count_lru2 == 0)
{
lru_list->bottom_2 = NULL;
}
else
{
assert (bcb_bottom != NULL && pgbuf_bcb_get_zone (bcb_bottom) == PGBUF_LRU_2_ZONE);
lru_list->bottom_2 = bcb_bottom;
}
}
/*
* pgbuf_lru_adjust_zones () - adjust the middle of lru list and update bcb zones
*
* return : void
* thread_p (in) : thread entry
* lru_list (in) : lru list
* min_one (in) : true to keep at least one entry in 1&2 zones.
*/
STATIC_INLINE void
pgbuf_lru_adjust_zones (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, bool min_one)
{
PGBUF_BCB *bcb_bottom;
PGBUF_BCB *bcb_prev;
int threshold;
/* first adjust zone 1 & 2 and convert to zone 3. then we'll adjust zone 1 (and convert to 2) */
threshold = lru_list->threshold_lru1 + lru_list->threshold_lru2;
if (min_one)
{
threshold = MAX (1, threshold);
}
if (threshold >= PGBUF_LRU_ZONE_ONE_TWO_COUNT (lru_list))
{
/* just try to adjust zone 1. */
pgbuf_lru_adjust_zone1 (thread_p, lru_list, min_one);
return;
}
assert (PGBUF_LRU_ZONE_ONE_TWO_COUNT (lru_list) > 0);
assert (lru_list->bottom_1 != NULL || lru_list->bottom_2 != NULL);
for (bcb_bottom = lru_list->bottom_2 != NULL ? lru_list->bottom_2 : lru_list->bottom_1;
threshold < PGBUF_LRU_ZONE_ONE_TWO_COUNT (lru_list); bcb_bottom = bcb_prev)
{
/* save prev BCB in case this is removed from list */
bcb_prev = bcb_bottom->prev_BCB;
assert (bcb_bottom != NULL && pgbuf_bcb_get_zone (bcb_bottom) != PGBUF_LRU_3_ZONE);
pgbuf_lru_fall_bcb_to_zone_3 (thread_p, bcb_bottom, lru_list);
}
if (lru_list->count_lru2 == 0)
{
lru_list->bottom_2 = NULL;
if (lru_list->count_lru1 == 0)
{
lru_list->bottom_1 = NULL;
}
else
{
assert (bcb_bottom != NULL && pgbuf_bcb_get_zone (bcb_bottom) == PGBUF_LRU_1_ZONE);
lru_list->bottom_1 = bcb_bottom;
}
}
else
{
assert (bcb_bottom != NULL && pgbuf_bcb_get_zone (bcb_bottom) == PGBUF_LRU_2_ZONE);
lru_list->bottom_2 = bcb_bottom;
}
pgbuf_lru_sanity_check (lru_list);
pgbuf_lru_adjust_zone1 (thread_p, lru_list, min_one);
}
/*
* pgbuf_lru_fall_bcb_to_zone_3 () - bcb falls to zone 3 of lru list
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : bcb in lru list
* lru_list (in) : lru list
*/
STATIC_INLINE void
pgbuf_lru_fall_bcb_to_zone_3 (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, PGBUF_LRU_LIST * lru_list)
{
assert (pgbuf_bcb_get_zone (bcb) == PGBUF_LRU_1_ZONE || pgbuf_bcb_get_zone (bcb) == PGBUF_LRU_2_ZONE);
#if defined (SERVER_MODE)
/* can we assign this directly as victim? */
if (pgbuf_is_bcb_victimizable (bcb, false) && pgbuf_is_any_thread_waiting_for_direct_victim ())
{
if (pgbuf_bcb_is_to_vacuum (bcb))
{
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
perfmon_inc_stat (thread_p, PSTAT_PB_VICTIM_ASSIGN_DIRECT_ADJUST_TO_VACUUM);
}
/* fall through */
}
else
{
/* we first need mutex on bcb. however, we'd normally first get mutex on bcb and then on list. since we don't
* want to over complicate things, just try a conditional lock on mutex. if it fails, we'll just give up
* assigning the bcb directly as victim */
if (PGBUF_BCB_TRYLOCK (bcb) == 0)
{
VPID vpid_copy = bcb->vpid;
if (pgbuf_is_bcb_victimizable (bcb, true) && pgbuf_assign_direct_victim (thread_p, bcb))
{
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
perfmon_inc_stat (thread_p, PSTAT_PB_VICTIM_ASSIGN_DIRECT_ADJUST);
}
/* since bcb is going to be removed from list and I have both lru and bcb mutex, why not do it now. */
pgbuf_remove_from_lru_list (thread_p, bcb, lru_list);
PGBUF_BCB_UNLOCK (bcb);
pgbuf_add_vpid_to_aout_list (thread_p, &vpid_copy, lru_list->index);
return;
}
/* not assigned. unlock bcb mutex and fall through */
PGBUF_BCB_UNLOCK (bcb);
}
else
{
/* don't try too hard. it will be victimized eventually. */
/* fall through */
}
}
}
/* not assigned directly */
#endif /* SERVER_MODE */
/* tick_lru3 */
bcb->tick_lru3 = lru_list->tick_lru3;
if (++lru_list->tick_lru3 >= DB_INT32_MAX)
{
lru_list->tick_lru3 = 0;
}
pgbuf_bcb_change_zone (thread_p, bcb, lru_list->index, PGBUF_LRU_3_ZONE);
}
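/* minimal sketch (not part of the original source, guarded out on purpose): how the zone-3 tick recorded above can
* be turned into an approximate age, taking the DB_INT32_MAX wrap-around into account. the helper name is
* hypothetical. */
#if 0
static int
pgbuf_lru3_age_sketch (const PGBUF_LRU_LIST * lru_list, const PGBUF_BCB * bcb)
{
int age = lru_list->tick_lru3 - bcb->tick_lru3;
if (age < 0)
{
/* the list tick wrapped around to 0 after reaching DB_INT32_MAX */
age += DB_INT32_MAX;
}
return age;
}
#endif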
/*
* pgbuf_lru_boost_bcb () - boost bcb.
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : bcb to move to top
*/
static void
pgbuf_lru_boost_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
PGBUF_LRU_LIST *lru_list;
PGBUF_ZONE zone = pgbuf_bcb_get_zone (bcb);
bool is_private;
assert (PGBUF_IS_BCB_IN_LRU (bcb));
lru_list = pgbuf_lru_list_from_bcb (bcb);
is_private = PGBUF_IS_PRIVATE_LRU_INDEX (lru_list->index);
/* rules for boosting bcb's in lru lists (also see code in pgbuf_unlatch_bcb_upon_unfix):
* 1. never boost bcb's in zone 1. this is usually the hottest part of the lists and should have a big hit ratio.
* we'd like to avoid locking list mutex and making changes, these bcb's are in no danger of being victimized,
* so we just don't move them.
* 2. avoid boosting new and cold bcb's. a bcb can be fixed/unfixed several times and still be cold. many operations
* will fix a page at least twice (once to read and once to write), and we'd like to avoid boosting the bcb on
* second unfix. we do have a trick to detect such cases: we save the list tick whenever new bcb's are inserted
* to zones 1 and 2. if a page is quickly fixed several times, its "age" is really small (age being the difference
* between the bcb's saved tick and current list tick), and we don't boost it. it should be unfixed again after
* aging a little before being boosted to top.
* 3. always boost from third zone, since these are decently old.
*
* note: early outs should be handled in pgbuf_unlatch_bcb_upon_unfix.
*/
assert (zone != PGBUF_LRU_1_ZONE);
/* we'll boost. collect stats */
if (zone == PGBUF_LRU_2_ZONE)
{
perfmon_inc_stat (thread_p, is_private ? PSTAT_PB_UNFIX_LRU_TWO_PRV_TO_TOP : PSTAT_PB_UNFIX_LRU_TWO_SHR_TO_TOP);
}
else
{
assert (zone == PGBUF_LRU_3_ZONE);
perfmon_inc_stat (thread_p,
is_private ? PSTAT_PB_UNFIX_LRU_THREE_PRV_TO_TOP : PSTAT_PB_UNFIX_LRU_THREE_SHR_TO_TOP);
}
/* lock list */
pthread_mutex_lock (&lru_list->mutex);
/* remove from current position */
pgbuf_remove_from_lru_list (thread_p, bcb, lru_list);
/* add to top */
pgbuf_lru_add_bcb_to_top (thread_p, bcb, lru_list);
/* since we added a new bcb to lru 1, we should adjust zones */
if (zone == PGBUF_LRU_2_ZONE)
{
/* adjust only zone 1 */
pgbuf_lru_adjust_zone1 (thread_p, lru_list, true);
}
else
{
pgbuf_lru_adjust_zones (thread_p, lru_list, true);
}
pgbuf_lru_sanity_check (lru_list);
/* unlock list */
pthread_mutex_unlock (&lru_list->mutex);
}
/*
* pgbuf_lru_add_new_bcb_to_top () - add a new bcb to top of lru list
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : new bcb
* lru_idx (in) : lru list index
*/
STATIC_INLINE void
pgbuf_lru_add_new_bcb_to_top (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int lru_idx)
{
PGBUF_LRU_LIST *lru_list;
/* this is for new bcb's only; the bcb must not already be in an lru list */
assert (!PGBUF_IS_BCB_IN_LRU (bcb));
/* lock list */
lru_list = &pgbuf_Pool.buf_LRU_list[lru_idx];
pthread_mutex_lock (&lru_list->mutex);
/* add to top */
/* this is new bcb, we must init its list tick */
bcb->tick_lru_list = lru_list->tick_list;
pgbuf_lru_add_bcb_to_top (thread_p, bcb, lru_list);
pgbuf_lru_sanity_check (lru_list);
/* since we added a new bcb to lru 1, we should adjust zones */
pgbuf_lru_adjust_zones (thread_p, lru_list, true);
pgbuf_lru_sanity_check (lru_list);
/* unlock list */
pthread_mutex_unlock (&lru_list->mutex);
}
/*
* pgbuf_lru_add_new_bcb_to_middle () - add a new bcb to middle of lru list
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : new bcb
* lru_idx (in) : lru list index
*/
STATIC_INLINE void
pgbuf_lru_add_new_bcb_to_middle (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int lru_idx)
{
PGBUF_LRU_LIST *lru_list;
/* this is for new bcb's only; the bcb must not already be in an lru list */
assert (!PGBUF_IS_BCB_IN_LRU (bcb));
lru_list = &pgbuf_Pool.buf_LRU_list[lru_idx];
pthread_mutex_lock (&lru_list->mutex);
bcb->tick_lru_list = lru_list->tick_list;
pgbuf_lru_add_bcb_to_middle (thread_p, bcb, lru_list);
pgbuf_lru_sanity_check (lru_list);
/* adjust zone 2 */
pgbuf_lru_adjust_zone2 (thread_p, lru_list, true);
pgbuf_lru_sanity_check (lru_list);
pthread_mutex_unlock (&lru_list->mutex);
}
/*
* pgbuf_lru_add_new_bcb_to_bottom () - add a new bcb to bottom of lru list
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : new bcb
* lru_idx (in) : lru list index
*/
STATIC_INLINE void
pgbuf_lru_add_new_bcb_to_bottom (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int lru_idx)
{
PGBUF_LRU_LIST *lru_list;
/* this is for new bcb's only; the bcb must not already be in an lru list */
assert (!PGBUF_IS_BCB_IN_LRU (bcb));
if (pgbuf_is_bcb_victimizable (bcb, true) && pgbuf_assign_direct_victim (thread_p, bcb))
{
/* assigned directly */
/* TODO: add stat. this is actually not used for now. */
return;
}
/* lock list */
lru_list = &pgbuf_Pool.buf_LRU_list[lru_idx];
pthread_mutex_lock (&lru_list->mutex);
bcb->tick_lru_list = lru_list->tick_list;
pgbuf_lru_add_bcb_to_bottom (thread_p, bcb, lru_list);
pgbuf_lru_sanity_check (lru_list);
/* unlock list */
pthread_mutex_unlock (&lru_list->mutex);
}
/*
* pgbuf_lru_remove_bcb () - remove bcb from lru list
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : bcb
*/
STATIC_INLINE void
pgbuf_lru_remove_bcb (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
PGBUF_LRU_LIST *lru_list;
assert (PGBUF_IS_BCB_IN_LRU (bcb));
lru_list = pgbuf_lru_list_from_bcb (bcb);
/* lock list */
pthread_mutex_lock (&lru_list->mutex);
/* remove bcb from list */
pgbuf_remove_from_lru_list (thread_p, bcb, lru_list);
pgbuf_lru_sanity_check (lru_list);
/* unlock list */
pthread_mutex_unlock (&lru_list->mutex);
}
/*
* pgbuf_lru_move_from_private_to_shared () - move a bcb from private list to shared list
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : private list bcb
*/
static void
pgbuf_lru_move_from_private_to_shared (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
/* bcb must be in private list */
assert (PGBUF_IS_PRIVATE_LRU_INDEX (pgbuf_bcb_get_lru_index (bcb)));
/* note: from statistics analysis, moves from private to shared are very rare, so we don't inline the function */
/* remove bcb from its lru list */
pgbuf_lru_remove_bcb (thread_p, bcb);
/* add bcb to middle of shared list */
pgbuf_lru_add_new_bcb_to_middle (thread_p, bcb, pgbuf_get_shared_lru_index_for_add ());
pgbuf_bcb_register_hit_for_lru (bcb);
}
/*
* pgbuf_remove_from_lru_list () - Remove a BCB from the LRU list
* return : void
* bufptr (in) : BCB
* lru_list (in) : LRU list to which the BCB currently belongs
*
* Note: The caller MUST hold the LRU list mutex.
*/
STATIC_INLINE void
pgbuf_remove_from_lru_list (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, PGBUF_LRU_LIST * lru_list)
{
PGBUF_BCB *bcb_prev = NULL;
if (lru_list->top == bufptr)
{
lru_list->top = bufptr->next_BCB;
}
if (lru_list->bottom == bufptr)
{
lru_list->bottom = bufptr->prev_BCB;
}
if (lru_list->bottom_1 == bufptr)
{
lru_list->bottom_1 = bufptr->prev_BCB;
}
if (lru_list->bottom_2 == bufptr)
{
if (bufptr->prev_BCB != NULL && pgbuf_bcb_get_zone (bufptr->prev_BCB) == PGBUF_LRU_2_ZONE)
{
lru_list->bottom_2 = bufptr->prev_BCB;
}
else
{
assert (lru_list->count_lru2 == 1);
lru_list->bottom_2 = NULL;
}
}
if (bufptr->next_BCB != NULL)
{
(bufptr->next_BCB)->prev_BCB = bufptr->prev_BCB;
}
bcb_prev = bufptr->prev_BCB;
if (bcb_prev != NULL)
{
bcb_prev->next_BCB = bufptr->next_BCB;
}
bufptr->prev_BCB = NULL;
bufptr->next_BCB = NULL;
/* we need to update the victim hint now, since the bcb has been disconnected from the list.
* pgbuf_lru_remove_victim_candidate will not know which is the previous BCB. we cannot change the hint before
* disconnecting the bcb from the list, because we need to be sure no one else sets the hint to this bcb. */
pgbuf_lru_advance_victim_hint (thread_p, lru_list, bufptr, bcb_prev, false);
/* update zone */
pgbuf_bcb_change_zone (thread_p, bufptr, 0, PGBUF_VOID_ZONE);
}
/*
* pgbuf_move_bcb_to_bottom_lru () - move a bcb to the bottom of its lru (or other lru if it is in the void zone).
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : bcb
*/
static void
pgbuf_move_bcb_to_bottom_lru (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
PGBUF_ZONE zone = pgbuf_bcb_get_zone (bcb);
int lru_idx;
PGBUF_LRU_LIST *lru_list;
pgbuf_bcb_update_flags (thread_p, bcb, 0, PGBUF_BCB_MOVE_TO_LRU_BOTTOM_FLAG);
if (zone == PGBUF_VOID_ZONE)
{
/* move to the bottom of a lru list so it can be found by flush thread */
if (PGBUF_THREAD_HAS_PRIVATE_LRU (thread_p))
{
lru_idx = PGBUF_LRU_INDEX_FROM_PRIVATE (PGBUF_PRIVATE_LRU_FROM_THREAD (thread_p));
}
else
{
lru_idx = pgbuf_get_shared_lru_index_for_add ();
}
pgbuf_lru_add_new_bcb_to_bottom (thread_p, bcb, lru_idx);
}
else if (zone & PGBUF_LRU_ZONE_MASK)
{
lru_idx = pgbuf_bcb_get_lru_index (bcb);
lru_list = PGBUF_GET_LRU_LIST (lru_idx);
if (bcb == lru_list->bottom)
{
/* early out */
return;
}
pthread_mutex_lock (&lru_list->mutex);
pgbuf_remove_from_lru_list (thread_p, bcb, lru_list);
pgbuf_lru_add_bcb_to_bottom (thread_p, bcb, lru_list);
pthread_mutex_unlock (&lru_list->mutex);
}
else
{
assert (false);
}
}
/*
* pgbuf_add_vpid_to_aout_list () - add VPID to Aout list
* return : void
* thread_p (in) :
* vpid (in) :
* lru_idx (in) : LRU index of the list the VPID had belonged to
*/
STATIC_INLINE void
pgbuf_add_vpid_to_aout_list (THREAD_ENTRY * thread_p, const VPID * vpid, const int lru_idx)
{
#if defined(SERVER_MODE)
int rv;
#endif /* SERVER_MODE */
PGBUF_AOUT_LIST *list;
PGBUF_AOUT_BUF *aout_buf;
int hash_idx = 0;
if (pgbuf_Pool.buf_AOUT_list.max_count <= 0)
{
return;
}
assert (!VPID_ISNULL (vpid));
list = &pgbuf_Pool.buf_AOUT_list;
rv = pthread_mutex_lock (&pgbuf_Pool.buf_AOUT_list.Aout_mutex);
if (list->Aout_free == NULL)
{
assert (list->Aout_bottom != NULL);
/* disconnect the bottom */
aout_buf = list->Aout_bottom;
if (list->Aout_bottom->prev == NULL)
{
assert (false);
}
list->Aout_bottom = list->Aout_bottom->prev;
list->Aout_bottom->next = NULL;
/* also remove entry from hash table */
hash_idx = AOUT_HASH_IDX (&aout_buf->vpid, list);
mht_rem (list->aout_buf_ht[hash_idx], &aout_buf->vpid, NULL, NULL);
}
else
{
aout_buf = list->Aout_free;
list->Aout_free = list->Aout_free->next;
}
aout_buf->next = NULL;
aout_buf->prev = NULL;
aout_buf->lru_idx = lru_idx;
VPID_COPY (&aout_buf->vpid, vpid);
/* add to hash */
hash_idx = AOUT_HASH_IDX (&aout_buf->vpid, list);
mht_put (list->aout_buf_ht[hash_idx], &aout_buf->vpid, aout_buf);
if (list->Aout_top == NULL)
{
/* this is the only page in the Aout list */
assert (list->Aout_bottom == NULL);
aout_buf->next = NULL;
aout_buf->prev = NULL;
list->Aout_top = aout_buf;
list->Aout_bottom = aout_buf;
}
else
{
aout_buf->next = list->Aout_top;
list->Aout_top->prev = aout_buf;
list->Aout_top = aout_buf;
}
pthread_mutex_unlock (&pgbuf_Pool.buf_AOUT_list.Aout_mutex);
}
/*
* pgbuf_remove_vpid_from_aout_list () - Search for VPID in Aout and remove it from the queue
* return : identifier of the list from which it was removed:
* 0 and positive: LRU list
* PGBUF_AOUT_NOT_FOUND: not found in Aout list
* thread_p (in) :
* vpid (in) :
*/
static int
pgbuf_remove_vpid_from_aout_list (THREAD_ENTRY * thread_p, const VPID * vpid)
{
#if defined(SERVER_MODE)
int rv;
#endif /* SERVER_MODE */
PGBUF_AOUT_BUF *aout_buf;
int hash_idx;
int aout_list_id = PGBUF_AOUT_NOT_FOUND;
if (pgbuf_Pool.buf_AOUT_list.max_count <= 0)
{
/* Aout list not used */
return PGBUF_AOUT_NOT_FOUND;
}
hash_idx = AOUT_HASH_IDX (vpid, (&pgbuf_Pool.buf_AOUT_list));
rv = pthread_mutex_lock (&pgbuf_Pool.buf_AOUT_list.Aout_mutex);
/* Search the vpid in the hash table */
aout_buf = (PGBUF_AOUT_BUF *) mht_get (pgbuf_Pool.buf_AOUT_list.aout_buf_ht[hash_idx], vpid);
if (aout_buf == NULL)
{
/* Not there, just return */
pthread_mutex_unlock (&pgbuf_Pool.buf_AOUT_list.Aout_mutex);
return PGBUF_AOUT_NOT_FOUND;
}
/* We can assume that aout_buf is what we're looking for if it still has the same VPID as before acquiring the mutex.
* The reason for this is that nobody can change it while we're holding the mutex. Any changes must be visible before
* we acquire this mutex */
aout_list_id = aout_buf->lru_idx;
if (aout_buf == pgbuf_Pool.buf_AOUT_list.Aout_bottom)
{
pgbuf_Pool.buf_AOUT_list.Aout_bottom = pgbuf_Pool.buf_AOUT_list.Aout_bottom->prev;
if (pgbuf_Pool.buf_AOUT_list.Aout_bottom != NULL)
{
pgbuf_Pool.buf_AOUT_list.Aout_bottom->next = NULL;
}
aout_buf->prev = NULL;
}
if (aout_buf == pgbuf_Pool.buf_AOUT_list.Aout_top)
{
pgbuf_Pool.buf_AOUT_list.Aout_top = pgbuf_Pool.buf_AOUT_list.Aout_top->next;
if (pgbuf_Pool.buf_AOUT_list.Aout_top != NULL)
{
pgbuf_Pool.buf_AOUT_list.Aout_top->prev = NULL;
}
aout_buf->next = NULL;
}
if (aout_buf->prev != NULL)
{
aout_buf->prev->next = aout_buf->next;
}
if (aout_buf->next != NULL)
{
aout_buf->next->prev = aout_buf->prev;
}
/* remove vpid from hash */
mht_rem (pgbuf_Pool.buf_AOUT_list.aout_buf_ht[hash_idx], vpid, NULL, NULL);
/* add to free list */
VPID_SET_NULL (&aout_buf->vpid);
aout_buf->lru_idx = PGBUF_AOUT_NOT_FOUND;
aout_buf->next = NULL;
aout_buf->prev = NULL;
aout_buf->next = pgbuf_Pool.buf_AOUT_list.Aout_free;
pgbuf_Pool.buf_AOUT_list.Aout_free = aout_buf;
pthread_mutex_unlock (&pgbuf_Pool.buf_AOUT_list.Aout_mutex);
return aout_list_id;
}
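/* minimal usage sketch (not part of the original source, guarded out): the Aout list behaves like the ghost queue of
* a 2Q-style policy. a victimized page's VPID is enqueued through pgbuf_add_vpid_to_aout_list, and a later page fix
* can consult it to decide how warm the re-read page is. all names except the Aout and lru-add functions are
* hypothetical. */
#if 0
static void
pgbuf_place_reread_bcb_sketch (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, const VPID * vpid, int default_lru_idx)
{
int aout_lru_idx = pgbuf_remove_vpid_from_aout_list (thread_p, vpid);
if (aout_lru_idx != PGBUF_AOUT_NOT_FOUND)
{
/* evicted recently: treat the page as warm and insert it higher */
pgbuf_lru_add_new_bcb_to_middle (thread_p, bcb, aout_lru_idx);
}
else
{
/* cold page: insert at the bottom, where it is cheap to victimize again */
pgbuf_lru_add_new_bcb_to_bottom (thread_p, bcb, default_lru_idx);
}
}
#endif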
/*
* pgbuf_remove_private_from_aout_list () - Search the Aout list and remove all VPIDs having a specific LRU idx
*
* return : number of VPIDs removed
* lru_idx (in) :
*/
static int
pgbuf_remove_private_from_aout_list (const int lru_idx)
{
PGBUF_AOUT_BUF *aout_buf;
PGBUF_AOUT_BUF *aout_buf_next;
int hash_idx;
int cnt_removed = 0;
if (pgbuf_Pool.buf_AOUT_list.max_count <= 0)
{
/* Aout list not used */
return cnt_removed;
}
pthread_mutex_lock (&pgbuf_Pool.buf_AOUT_list.Aout_mutex);
aout_buf = pgbuf_Pool.buf_AOUT_list.Aout_top;
while (aout_buf != NULL)
{
if (aout_buf->lru_idx != lru_idx)
{
aout_buf = aout_buf->next;
continue;
}
aout_buf_next = aout_buf->next;
/* remove this item */
if (aout_buf == pgbuf_Pool.buf_AOUT_list.Aout_bottom)
{
pgbuf_Pool.buf_AOUT_list.Aout_bottom = pgbuf_Pool.buf_AOUT_list.Aout_bottom->prev;
if (pgbuf_Pool.buf_AOUT_list.Aout_bottom != NULL)
{
pgbuf_Pool.buf_AOUT_list.Aout_bottom->next = NULL;
}
aout_buf->prev = NULL;
}
if (aout_buf == pgbuf_Pool.buf_AOUT_list.Aout_top)
{
pgbuf_Pool.buf_AOUT_list.Aout_top = pgbuf_Pool.buf_AOUT_list.Aout_top->next;
if (pgbuf_Pool.buf_AOUT_list.Aout_top != NULL)
{
pgbuf_Pool.buf_AOUT_list.Aout_top->prev = NULL;
}
aout_buf->next = NULL;
}
if (aout_buf->prev != NULL)
{
aout_buf->prev->next = aout_buf->next;
}
if (aout_buf->next != NULL)
{
aout_buf->next->prev = aout_buf->prev;
}
hash_idx = AOUT_HASH_IDX (&aout_buf->vpid, (&pgbuf_Pool.buf_AOUT_list));
mht_rem (pgbuf_Pool.buf_AOUT_list.aout_buf_ht[hash_idx], &aout_buf->vpid, NULL, NULL);
/* add to free list */
VPID_SET_NULL (&aout_buf->vpid);
aout_buf->lru_idx = PGBUF_AOUT_NOT_FOUND;
aout_buf->next = NULL;
aout_buf->prev = NULL;
aout_buf->next = pgbuf_Pool.buf_AOUT_list.Aout_free;
pgbuf_Pool.buf_AOUT_list.Aout_free = aout_buf;
aout_buf = aout_buf_next;
cnt_removed++;
}
pthread_mutex_unlock (&pgbuf_Pool.buf_AOUT_list.Aout_mutex);
return cnt_removed;
}
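/* illustrative note (not part of the original source): unlike the hash-based removal above, this sweep walks the
* entire ghost queue; presumably it is only needed in the rare case when a private lru list is recycled and all of
* its Aout entries become stale at once. */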
/*
* pgbuf_bcb_flush_with_wal () - write a buffer page to disk.
*
* return : error code
* thread_p (in) : thread entry
* bufptr (in) : bcb
* is_page_flush_thread (in) : true if caller is page flush thread. false otherwise.
* is_bcb_locked (out) : output whether bcb remains locked or not.
*/
STATIC_INLINE int
pgbuf_bcb_flush_with_wal (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, bool is_page_flush_thread, bool * is_bcb_locked)
{
char page_buf[IO_MAX_PAGE_SIZE + MAX_ALIGNMENT];
FILEIO_PAGE *iopage = NULL;
PAGE_PTR pgptr = NULL;
LOG_LSA oldest_unflush_lsa;
int error = NO_ERROR;
#if defined(ENABLE_SYSTEMTAP)
QUERY_ID query_id = NULL_QUERY_ID;
bool monitored = false;
#endif /* ENABLE_SYSTEMTAP */
bool was_dirty = false, uses_dwb;
DWB_SLOT *dwb_slot = NULL;
LOG_LSA lsa;
FILEIO_WRITE_MODE write_mode;
bool is_temp = pgbuf_is_temporary_volume (bufptr->vpid.volid);
TDE_ALGORITHM tde_algo = TDE_ALGORITHM_NONE;
int tran_index = LOG_FIND_THREAD_TRAN_INDEX (thread_p);
PGBUF_STATUS *show_status = &pgbuf_Pool.show_status[tran_index];
PGBUF_BCB_CHECK_OWN (bufptr);
/* the caller is holding bufptr->mutex */
*is_bcb_locked = true;
assert (get_latch (&bufptr->atomic_latch) == PGBUF_NO_LATCH || get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_READ
|| get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_WRITE);
#if !defined (NDEBUG) && defined (SERVER_MODE)
if (get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_WRITE)
{
/* I must be the owner, or else we'll be in trouble. */
int thread_index = thread_p->index;
PGBUF_HOLDER_ANCHOR *thrd_holder_info = &pgbuf_Pool.thrd_holder_info[thread_index];
PGBUF_HOLDER *holder = NULL;
/* Search for bufptr in current thread holder list. */
for (holder = thrd_holder_info->thrd_hold_list; holder != NULL; holder = holder->thrd_link)
{
if (holder->bufptr == bufptr)
{
break;
}
}
/* Safe guard: I must be the bufptr holder. */
assert (holder != NULL);
}
#endif /* !NDEBUG && SERVER_MODE */
/* how this works:
*
* caller should already have bcb locked. we don't do checks of opportunity or correctness here (that's up to the
* caller).
*
* we copy the page and save oldest_unflush_lsa and then we try to write the page to disk. if writing fails, we
* "revert" changes (restore dirty flag and oldest_unflush_lsa).
*
* if successful, we choose one of the paths:
* 1. send the page to post-flush to process it and assign it directly (if this is page flush thread and victimization
* system is stressed).
* 2. lock the bcb again, clear its flushing status, wake up threads waiting for flush, and return.
*/
if (pgbuf_check_bcb_page_vpid (bufptr, false) != true)
{
assert (false);
return ER_FAILED;
}
was_dirty = pgbuf_bcb_mark_is_flushing (thread_p, bufptr);
uses_dwb = dwb_is_created () && !is_temp;
start_copy_page:
iopage = (FILEIO_PAGE *) PTR_ALIGN (page_buf, MAX_ALIGNMENT);
CAST_BFPTR_TO_PGPTR (pgptr, bufptr);
tde_algo = pgbuf_get_tde_algorithm (pgptr);
if (tde_algo != TDE_ALGORITHM_NONE)
{
error = tde_encrypt_data_page (&bufptr->iopage_buffer->iopage, tde_algo, is_temp, iopage);
if (error != NO_ERROR)
{
ASSERT_ERROR ();
return error;
}
}
else
{
memcpy ((void *) iopage, (void *) (&bufptr->iopage_buffer->iopage), IO_PAGESIZE);
}
if (uses_dwb)
{
error = dwb_set_data_on_next_slot (thread_p, iopage, false, false, &dwb_slot);
if (error != NO_ERROR)
{
return error;
}
if (dwb_slot != NULL)
{
iopage = NULL;
goto copy_unflushed_lsa;
}
}
copy_unflushed_lsa:
LSA_COPY (&lsa, &(bufptr->iopage_buffer->iopage.prv.lsa));
LSA_COPY (&oldest_unflush_lsa, &bufptr->oldest_unflush_lsa);
LSA_SET_NULL (&bufptr->oldest_unflush_lsa);
PGBUF_BCB_UNLOCK (bufptr);
*is_bcb_locked = false;
if (!LSA_ISNULL (&oldest_unflush_lsa))
{
/* confirm WAL protocol */
/* force log record to disk */
logpb_flush_log_for_wal (thread_p, &lsa);
}
else
{
/* if page was changed, the change was not logged. this is a rare case, but can happen. */
if (!pgbuf_is_temporary_volume (bufptr->vpid.volid))
{
er_log_debug (ARG_FILE_LINE, "flushing page %d|%d to disk without logging.\n", VPID_AS_ARGS (&bufptr->vpid));
}
}
#if defined(ENABLE_SYSTEMTAP)
query_id = qmgr_get_current_query_id (thread_p);
if (query_id != NULL_QUERY_ID)
{
monitored = true;
CUBRID_IO_WRITE_START (query_id);
}
#endif /* ENABLE_SYSTEMTAP */
/* Activating/deactivating DWB while the server is alive needs additional work. For now, we don't care about
* this case, we can use it to test performance differences.
*/
if (uses_dwb)
{
error = dwb_add_page (thread_p, iopage, &bufptr->vpid, false, &dwb_slot);
if (error == NO_ERROR)
{
if (dwb_slot == NULL)
{
/* DWB disabled meanwhile, try again without DWB. */
uses_dwb = false;
PGBUF_BCB_LOCK (bufptr);
*is_bcb_locked = true;
goto start_copy_page;
}
}
}
else
{
show_status->num_pages_written++;
/* Record number of writes in statistics */
write_mode = (dwb_is_created () == true ? FILEIO_WRITE_NO_COMPENSATE_WRITE : FILEIO_WRITE_DEFAULT_WRITE);
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_IOWRITES);
if (fileio_write (thread_p, fileio_get_volume_descriptor (bufptr->vpid.volid), iopage, bufptr->vpid.pageid,
IO_PAGESIZE, write_mode) == NULL)
{
error = ER_FAILED;
}
}
#if defined(ENABLE_SYSTEMTAP)
if (monitored == true)
{
CUBRID_IO_WRITE_END (query_id, IO_PAGESIZE, (error != NO_ERROR));
}
#endif /* ENABLE_SYSTEMTAP */
if (error != NO_ERROR)
{
PGBUF_BCB_LOCK (bufptr);
*is_bcb_locked = true;
pgbuf_bcb_mark_was_not_flushed (thread_p, bufptr, was_dirty);
LSA_COPY (&bufptr->oldest_unflush_lsa, &oldest_unflush_lsa);
#if defined (SERVER_MODE)
if (bufptr->next_wait_thrd != NULL)
{
pgbuf_wake_flush_waiters (thread_p, bufptr);
}
#endif
return ER_FAILED;
}
assert (get_latch (&bufptr->atomic_latch) != PGBUF_LATCH_FLUSH);
#if defined (SERVER_MODE)
/* if the flush thread is under pressure, we'll move some of the workload to post-flush thread. */
if (is_page_flush_thread && (pgbuf_Page_post_flush_daemon != NULL)
&& pgbuf_is_any_thread_waiting_for_direct_victim () && pgbuf_Pool.flushed_bcbs->produce (bufptr))
{
/* page buffer maintenance thread will try to assign this bcb directly as victim. */
pgbuf_Page_post_flush_daemon->wakeup ();
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
perfmon_inc_stat (thread_p, PSTAT_PB_FLUSH_SEND_DIRTY_TO_POST_FLUSH);
}
}
else
#endif /* SERVER_MODE */
{
PGBUF_BCB_LOCK (bufptr);
*is_bcb_locked = true;
pgbuf_bcb_mark_was_flushed (thread_p, bufptr);
#if defined (SERVER_MODE)
if (bufptr->next_wait_thrd != NULL)
{
pgbuf_wake_flush_waiters (thread_p, bufptr);
}
#endif
}
if (perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
perfmon_inc_stat (thread_p, PSTAT_PB_FLUSH_PAGE_FLUSHED);
}
return NO_ERROR;
}
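/* minimal caller-side sketch (not part of the original source, guarded out): the is_bcb_locked output must always be
* honored, because the function may return with the bcb mutex already released (see pgbuf_flush_neighbor_safe below
* for a real caller). */
#if 0
bool is_bcb_locked = true;
if (pgbuf_bcb_flush_with_wal (thread_p, bufptr, false, &is_bcb_locked) != NO_ERROR)
{
/* handle the error; the output flag still tells us whether to unlock */
}
if (is_bcb_locked)
{
PGBUF_BCB_UNLOCK (bufptr);
}
#endif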
/*
* pgbuf_wake_flush_waiters () - wake up all threads waiting for flush
*
* return : void
* thread_p (in) : thread entry
* bcb (in) : flushed bcb
*/
static void
pgbuf_wake_flush_waiters (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
#if defined (SERVER_MODE)
THREAD_ENTRY *prev_waiter = NULL;
THREAD_ENTRY *crt_waiter = NULL;
THREAD_ENTRY *save_next_waiter = NULL;
PERF_UTIME_TRACKER timetr;
PERF_UTIME_TRACKER_START (thread_p, &timetr);
PGBUF_BCB_CHECK_OWN (bcb);
for (crt_waiter = bcb->next_wait_thrd; crt_waiter != NULL; crt_waiter = save_next_waiter)
{
save_next_waiter = crt_waiter->next_wait_thrd;
if (crt_waiter->request_latch_mode == PGBUF_LATCH_FLUSH)
{
/* wakeup and remove from list */
if (prev_waiter != NULL)
{
prev_waiter->next_wait_thrd = save_next_waiter;
}
else
{
bcb->next_wait_thrd = save_next_waiter;
}
crt_waiter->next_wait_thrd = NULL;
pgbuf_wakeup_uncond (crt_waiter);
}
else
{
prev_waiter = crt_waiter;
}
}
PERF_UTIME_TRACKER_TIME (thread_p, &timetr, PSTAT_PB_WAKE_FLUSH_WAITER);
#endif /* SERVER_MODE */
}
/*
* pgbuf_is_exist_blocked_reader_writer () - checks whether there exists any blocked reader/writer
* return: if found, true, otherwise, false
* bufptr(in): pointer to buffer page
*/
STATIC_INLINE bool
pgbuf_is_exist_blocked_reader_writer (PGBUF_BCB * bufptr)
{
#if defined(SERVER_MODE)
THREAD_ENTRY *thrd_entry;
/* check whether there exists any blocked reader/writer */
thrd_entry = bufptr->next_wait_thrd;
while (thrd_entry != NULL)
{
if (thrd_entry->request_latch_mode == PGBUF_LATCH_READ || thrd_entry->request_latch_mode == PGBUF_LATCH_WRITE)
{
return true;
}
thrd_entry = thrd_entry->next_wait_thrd;
}
#endif /* SERVER_MODE */
return false;
}
/*
* pgbuf_get_check_page_validation_level () - check whether debug page validation is enabled at the given level
* return: true if the configured debug validation level is at least page_validation_level (always false in release builds)
*
*/
STATIC_INLINE bool
pgbuf_get_check_page_validation_level (int page_validation_level)
{
#if !defined(NDEBUG)
return prm_get_integer_value (PRM_ID_PB_DEBUG_PAGE_VALIDATION_LEVEL) >= page_validation_level;
#else /* NDEBUG */
return false;
#endif /* NDEBUG */
}
/*
* pgbuf_is_valid_page () - Verify if given page is a valid one
* return: either: DISK_INVALID, DISK_VALID, DISK_ERROR
* vpid(in): Complete Page identifier
*
* Note: Verify that the given page is valid according to
* disk_is_page_sector_reserved_with_debug_crash.
* The function is a NOOP if we are not running with full debugging
* capabilities.
*/
DISK_ISVALID
pgbuf_is_valid_page (THREAD_ENTRY * thread_p, const VPID * vpid, bool no_error)
{
DISK_ISVALID valid;
if (fileio_get_volume_label (vpid->volid, PEEK) == NULL || VPID_ISNULL (vpid))
{
assert (no_error);
return DISK_INVALID;
}
/*valid = disk_isvalid_page (thread_p, vpid->volid, vpid->pageid); */
valid = disk_is_page_sector_reserved_with_debug_crash (thread_p, vpid->volid, vpid->pageid, !no_error);
if (valid == DISK_INVALID && !no_error)
{
er_set (ER_FATAL_ERROR_SEVERITY, ARG_FILE_LINE, ER_PB_BAD_PAGEID, 2, vpid->pageid,
fileio_get_volume_label (vpid->volid, PEEK));
assert (false);
}
return valid;
}
/*
* pgbuf_is_valid_page_ptr () - Validate an in-memory page pointer
* return: true/false
* pgptr(in): Pointer to page
*
* Note: Verify that the given page pointer points to the beginning of an
* in-memory page. This function is used for debugging purposes.
*/
static bool
pgbuf_is_valid_page_ptr (const PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr;
int bufid;
assert (pgptr != NULL);
/* NOTE: Does not need to hold mutex since the page is fixed */
for (bufid = 0; bufid < pgbuf_Pool.num_buffers; bufid++)
{
bufptr = PGBUF_FIND_BCB_PTR (bufid);
PGBUF_BCB_LOCK (bufptr);
if (((PAGE_PTR) (&(bufptr->iopage_buffer->iopage.page[0]))) == pgptr)
{
if (get_fcnt (&bufptr->atomic_latch) <= 0)
{
/* This situation must never occur. */
assert (false);
er_set (ER_FATAL_ERROR_SEVERITY, ARG_FILE_LINE, ER_PB_UNFIXED_PAGEPTR, 3, pgptr, bufptr->vpid.pageid,
fileio_get_volume_label (bufptr->vpid.volid, PEEK));
PGBUF_BCB_UNLOCK (bufptr);
return false;
}
else
{
PGBUF_BCB_UNLOCK (bufptr);
return true;
}
}
else
{
PGBUF_BCB_UNLOCK (bufptr);
}
}
er_set (ER_FATAL_ERROR_SEVERITY, ARG_FILE_LINE, ER_PB_UNKNOWN_PAGEPTR, 1, pgptr);
assert (false);
return false;
}
/*
* pgbuf_check_page_ptype () - Check that the page type is as expected. If it isn't, an assert will be hit.
*
* return : True if the page type is as expected.
* thread_p (in) : Thread entry.
* pgptr (in) : Pointer to buffer page.
* ptype (in) : Expected page type.
*/
bool
pgbuf_check_page_ptype (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, PAGE_TYPE ptype)
{
return pgbuf_check_page_ptype_internal (pgptr, ptype, false);
}
/*
* pgbuf_check_page_type_no_error () - Return whether the page type is the expected type given as argument. No assert
* is hit if not.
*
* return : True if the page type is as expected.
* thread_p (in) : Thread entry.
* pgptr (in) : Pointer to buffer page.
* ptype (in) : Expected page type.
*/
bool
pgbuf_check_page_type_no_error (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, PAGE_TYPE ptype)
{
return pgbuf_check_page_ptype_internal (pgptr, ptype, true);
}
/*
* pgbuf_check_page_ptype_internal () -
* return: true/false
* bufptr(in): pointer to buffer page
* ptype(in): page type
*
* Note: Verify if the given page's ptype is valid.
* This function is used for debugging purposes.
*/
STATIC_INLINE bool
pgbuf_check_page_ptype_internal (PAGE_PTR pgptr, PAGE_TYPE ptype, bool no_error)
{
PGBUF_BCB *bufptr;
if (pgptr == NULL)
{
assert (false);
return false;
}
/* TODO - do not delete me */
#if defined(NDEBUG)
if (log_is_in_crash_recovery ())
{
return true;
}
#endif
if (pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
if (pgbuf_is_valid_page_ptr (pgptr) == false)
{
return false;
}
}
/* NOTE: Does not need to hold mutex since the page is fixed */
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
assert (!VPID_ISNULL (&bufptr->vpid));
if (pgbuf_check_bcb_page_vpid (bufptr, false) == true)
{
if (bufptr->iopage_buffer->iopage.prv.ptype != PAGE_UNKNOWN && bufptr->iopage_buffer->iopage.prv.ptype != ptype)
{
assert_release (no_error);
return false;
}
}
else
{
assert_release (false);
return false;
}
return true;
}
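/* minimal usage sketch (not part of the original source, guarded out): a typical call site asserts the type of a
* freshly fixed page. pgbuf_fix, OLD_PAGE and PAGE_HEAP are from the public interface; the surrounding code is
* hypothetical. */
#if 0
pgptr = pgbuf_fix (thread_p, &vpid, OLD_PAGE, PGBUF_LATCH_READ, PGBUF_UNCONDITIONAL_LATCH);
if (pgptr != NULL)
{
/* hits an assert in debug builds if the page is not a heap page */
(void) pgbuf_check_page_ptype (thread_p, pgptr, PAGE_HEAP);
}
#endif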
/*
* pgbuf_check_bcb_page_vpid () - Validate an FILEIO_PAGE prv
* return: true/false
* bufptr(in): pointer to buffer page
* maybe_deallocated(in) : true, if page may be deallocated
*
* Note: Verify if the given page's prv is valid.
* This function is used for debugging purposes.
*/
STATIC_INLINE bool
pgbuf_check_bcb_page_vpid (PGBUF_BCB * bufptr, bool maybe_deallocated)
{
if (bufptr == NULL || VPID_ISNULL (&bufptr->vpid))
{
assert (bufptr != NULL);
assert (!VPID_ISNULL (&bufptr->vpid));
return false;
}
/* perm volume */
if (bufptr->vpid.volid > NULL_VOLID)
{
/* Check Page identifier */
assert ((maybe_deallocated && log_is_in_crash_recovery_and_not_yet_completes_redo ())
|| (bufptr->vpid.pageid == bufptr->iopage_buffer->iopage.prv.pageid
&& bufptr->vpid.volid == bufptr->iopage_buffer->iopage.prv.volid));
assert (bufptr->iopage_buffer->iopage.prv.p_reserve_1 == 0);
assert (bufptr->iopage_buffer->iopage.prv.p_reserve_2 == 0);
return (bufptr->vpid.pageid == bufptr->iopage_buffer->iopage.prv.pageid
&& bufptr->vpid.volid == bufptr->iopage_buffer->iopage.prv.volid);
}
else
{
return true; /* nop */
}
}
#if defined(CUBRID_DEBUG)
/*
* pgbuf_scramble () - Scramble the content of the buffer
* return: void
* iopage(in): Pointer to page portion
*
* Note: This is done for debugging reasons to make sure that a user of a
* buffer does not assume that buffers are initialized to zero. For safety
* reasons, the buffers are initialized to zero, instead of scrambled,
* when running in production mode.
*/
static void
pgbuf_scramble (FILEIO_PAGE * iopage)
{
MEM_REGION_INIT (iopage, IO_PAGESIZE);
fileio_init_lsa_of_page (iopage, IO_PAGESIZE);
/* Init Page identifier */
iopage->prv.pageid = -1;
iopage->prv.volid = -1;
iopage->prv.ptype = (unsigned char) PAGE_UNKNOWN;
iopage->prv.pflag = '\0';
iopage->prv.p_reserve_1 = 0;
iopage->prv.p_reserve_2 = 0;
iopage->prv.tde_nonce = 0;
}
/*
* pgbuf_dump_if_any_fixed () - Dump buffer pool if any page buffer is fixed
* return: void
*
* Note: This is a debugging function that can be used to verify if buffers
* were freed after a set of operations (e.g., a request or a API
* function).
* This function will not give you good results when there are multiple
* users in the system (multiprocessing)
*/
void
pgbuf_dump_if_any_fixed (void)
{
PGBUF_BCB *bufptr;
int bufid;
int consistent = PGBUF_CONTENT_GOOD;
#if defined(SERVER_MODE)
int rv;
#endif /* SERVER_MODE */
/* Make sure that each buffer is unfixed and consistent */
for (bufid = 0; bufid < pgbuf_Pool.num_buffers; bufid++)
{
bufptr = PGBUF_FIND_BCB_PTR (bufid);
PGBUF_BCB_LOCK (bufptr);
if (get_latch (&bufptr->atomic_latch) != PGBUF_LATCH_INVALID && get_fcnt (&bufptr->atomic_latch) > 0)
{
/* The buffer is not unfixed */
PGBUF_BCB_UNLOCK (bufptr);
pgbuf_dump ();
return;
}
consistent = pgbuf_is_consistent (bufptr, 0);
PGBUF_BCB_UNLOCK (bufptr);
if (consistent == PGBUF_CONTENT_BAD)
{
break;
}
}
if (consistent != PGBUF_CONTENT_GOOD)
{
pgbuf_dump ();
}
}
/*
* pgbuf_dump () - Dump the system area of each buffer
* return: void
*
* Note: This function is used for debugging purposes
*/
static void
pgbuf_dump (void)
{
PGBUF_BCB *bufptr;
int bufid, i;
int consistent;
int nfetched = 0;
int ndirty = 0;
const char *latch_mode_str, *zone_str, *consistent_str;
#if defined(SERVER_MODE)
int rv;
#endif /* SERVER_MODE */
(void) fflush (stderr);
(void) fflush (stdout);
(void) fprintf (stdout, "\n\n");
(void) fprintf (stdout, "Num buffers = %d\n", pgbuf_Pool.num_buffers);
/* Dump info cached about perm and tmp volume identifiers */
rv = pthread_mutex_lock (&pgbuf_Pool.volinfo_mutex);
(void) fprintf (stdout, "Lastperm volid = %d, Num permvols of tmparea = %d\n", pgbuf_Pool.last_perm_volid,
pgbuf_Pool.num_permvols_tmparea);
if (pgbuf_Pool.permvols_tmparea_volids != NULL)
{
(void) fprintf (stdout, "Permanent volumes with tmp area: ");
for (i = 0; i < pgbuf_Pool.num_permvols_tmparea; i++)
{
if (i != 0)
{
(void) fprintf (stdout, ", ");
}
(void) fprintf (stdout, "%d", pgbuf_Pool.permvols_tmparea_volids[i]);
}
(void) fprintf (stdout, "\n");
}
pthread_mutex_unlock (&pgbuf_Pool.volinfo_mutex);
/* Now, dump all buffer pages */
(void) fprintf (stdout,
" Buf Volid Pageid Fcnt LatchMode D A F Zone Lsa consistent Bufaddr Usrarea\n");
for (bufid = 0; bufid < pgbuf_Pool.num_buffers; bufid++)
{
bufptr = PGBUF_FIND_BCB_PTR (bufid);
PGBUF_BCB_LOCK (bufptr);
if (get_fcnt (&bufptr->atomic_latch) > 0)
{
nfetched++;
}
if (pgbuf_bcb_is_dirty (bufptr))
{
ndirty++;
}
/* check if the content of current buffer page is consistent. */
consistent = pgbuf_is_consistent (bufptr, 0);
if (!pgbuf_bcb_is_dirty (bufptr) && get_fcnt (&bufptr->atomic_latch) == 0 && consistent != PGBUF_CONTENT_BAD)
{
PGBUF_BCB_UNLOCK (bufptr);
continue;
}
else
{
latch_mode_str = pgbuf_latch_mode_str (get_latch (&bufptr->atomic_latch));
zone_str = pgbuf_zone_str (pgbuf_bcb_get_zone (bufptr));
consistent_str = pgbuf_consistent_str (consistent);
fprintf (stdout, "%4d %5d %6d %4d %9s %1d %1d %1d %11s %lld|%4d %10s %p %p-%p\n",
pgbuf_bcb_get_pool_index (bufptr), VPID_AS_ARGS (&bufptr->vpid), get_fcnt (&bufptr->atomic_latch),
latch_mode_str, pgbuf_bcb_is_dirty (bufptr), (int) pgbuf_bcb_is_flushing (bufptr),
(int) pgbuf_bcb_is_async_flush_request (bufptr), zone_str,
LSA_AS_ARGS (&bufptr->iopage_buffer->iopage.prv.lsa), consistent_str, (void *) bufptr,
(void *) (&bufptr->iopage_buffer->iopage.page[0]),
(void *) (&bufptr->iopage_buffer->iopage.page[DB_PAGESIZE - 1]));
}
PGBUF_BCB_UNLOCK (bufptr);
}
(void) fprintf (stdout, "Number of fetched buffers = %d\nNumber of dirty buffers = %d\n", nfetched, ndirty);
}
/*
* pgbuf_is_consistent () - Check if a page is consistent
* return:
* bufptr(in): Pointer to buffer
* likely_bad_after_fixcnt(in): Don't report the page as bad if
* fixcnt is greater than this
*
* Note: Consistency rule:
* If the memory page is dirty, its content should be different from
* the content of the page on disk; otherwise, the page is considered
* inconsistent. That is, someone set the page dirty without updating
* the page. This rule may fail since a page can be updated with the
* same content as the page on disk; however, this is a remote case.
*
* If the memory page is not dirty, its content should be identical
* to the content of the page on disk; otherwise, the page is considered
* inconsistent. This is the case where someone updated the page without
* setting it dirty.
*/
static int
pgbuf_is_consistent (const PGBUF_BCB * bufptr, int likely_bad_after_fixcnt)
{
int consistent = PGBUF_CONTENT_GOOD;
FILEIO_PAGE *malloc_io_pgptr;
bool is_page_corrupted;
/* the caller is holding bufptr->mutex */
if (memcmp (PGBUF_FIND_BUFFER_GUARD (bufptr), pgbuf_Guard, sizeof (pgbuf_Guard)) != 0)
{
er_log_debug (ARG_FILE_LINE, "SYSTEM ERROR buffer of pageid = %d|%d has been OVER RUN", bufptr->vpid.volid,
bufptr->vpid.pageid);
return PGBUF_CONTENT_BAD;
}
if (!VPID_ISNULL (&bufptr->vpid))
{
malloc_io_pgptr = (FILEIO_PAGE *) malloc (IO_PAGESIZE);
if (malloc_io_pgptr == NULL)
{
return consistent;
}
/* Read the disk page into local page area */
if (fileio_read (NULL, fileio_get_volume_descriptor (bufptr->vpid.volid), malloc_io_pgptr, bufptr->vpid.pageid,
IO_PAGESIZE) == NULL)
{
/* Unable to verify consistency of this page */
consistent = PGBUF_CONTENT_BAD;
}
else
{
/* If page is dirty, it should be different from the one on disk */
if (!LSA_EQ (&malloc_io_pgptr->prv.lsa, &bufptr->iopage_buffer->iopage.prv.lsa)
|| memcmp (malloc_io_pgptr->page, bufptr->iopage_buffer->iopage.page, DB_PAGESIZE) != 0)
{
consistent = (pgbuf_bcb_is_dirty (bufptr) ? PGBUF_CONTENT_GOOD : PGBUF_CONTENT_BAD);
/* If fix count is greater than likely_bad_after_fixcnt, the function cannot state that the page is bad */
if (consistent == PGBUF_CONTENT_BAD && get_fcnt (&bufptr->atomic_latch) > likely_bad_after_fixcnt)
{
consistent = PGBUF_CONTENT_LIKELY_BAD;
}
}
else
{
consistent = (pgbuf_bcb_is_dirty (bufptr) ? PGBUF_CONTENT_LIKELY_BAD : PGBUF_CONTENT_GOOD);
}
}
if (consistent != PGBUF_CONTENT_GOOD)
{
if (fileio_page_check_corruption (thread_get_thread_entry_info (), malloc_io_pgptr,
&is_page_corrupted) != NO_ERROR || is_page_corrupted)
{
consistent = PGBUF_CONTENT_BAD;
}
}
free_and_init (malloc_io_pgptr);
}
else
{
if (get_fcnt (&bufptr->atomic_latch) <= 0
&& pgbuf_get_check_page_validation_level (PGBUF_DEBUG_PAGE_VALIDATION_ALL))
{
int i;
/* The page should be scrambled; otherwise someone stepped on it */
for (i = 0; i < DB_PAGESIZE; i++)
{
if (bufptr->iopage_buffer->iopage.page[i] != MEM_REGION_SCRAMBLE_MARK)
{
/* The page has been stepped on by someone */
consistent = PGBUF_CONTENT_BAD;
break;
}
}
}
}
/* The I/O executed for pgbuf_is_consistent is not recorded... */
return consistent;
}
#endif /* CUBRID_DEBUG */
#if !defined(NDEBUG)
static void
pgbuf_add_fixed_at (PGBUF_HOLDER * holder, const char *caller_file, int caller_line, bool reset)
{
char buf[256];
const char *p;
p = caller_file + strlen (caller_file);
while (p)
{
if (p == caller_file)
{
break;
}
if (*p == '/' || *p == '\\')
{
p++;
break;
}
p--;
}
if (reset)
{
sprintf (holder->fixed_at, "%s:%d ", p, caller_line);
holder->fixed_at_size = (int) strlen (holder->fixed_at);
}
else
{
sprintf (buf, "%s:%d ", p, caller_line);
if (strstr (holder->fixed_at, buf) == NULL)
{
strcat (holder->fixed_at, buf);
holder->fixed_at_size += (int) strlen (buf);
assert (holder->fixed_at_size < (64 * 1024));
}
}
return;
}
#endif /* NDEBUG */
#if defined(SERVER_MODE)
static void
pgbuf_sleep (THREAD_ENTRY * thread_p, pthread_mutex_t * mutex_p)
{
thread_lock_entry (thread_p);
pthread_mutex_unlock (mutex_p);
thread_suspend_wakeup_and_unlock_entry (thread_p, THREAD_PGBUF_SUSPENDED);
}
STATIC_INLINE int
pgbuf_wakeup (THREAD_ENTRY * thread_p)
{
int r = NO_ERROR;
if (thread_p->request_latch_mode != PGBUF_NO_LATCH)
{
thread_p->resume_status = THREAD_PGBUF_RESUMED;
r = pthread_cond_signal (&thread_p->wakeup_cond);
if (r != 0)
{
er_set_with_oserror (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_CSS_PTHREAD_COND_SIGNAL, 0);
thread_unlock_entry (thread_p);
return ER_CSS_PTHREAD_COND_SIGNAL;
}
}
else
{
er_log_debug (ARG_FILE_LINE, "thread_entry (%d, %ld) already timedout\n", thread_p->tran_index,
thread_p->get_posix_id ());
}
thread_unlock_entry (thread_p);
return r;
}
STATIC_INLINE int
pgbuf_wakeup_uncond (THREAD_ENTRY * thread_p)
{
int r;
thread_lock_entry (thread_p);
thread_p->resume_status = THREAD_PGBUF_RESUMED;
r = pthread_cond_signal (&thread_p->wakeup_cond);
if (r != 0)
{
er_set_with_oserror (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_CSS_PTHREAD_COND_SIGNAL, 0);
thread_unlock_entry (thread_p);
return ER_CSS_PTHREAD_COND_SIGNAL;
}
thread_unlock_entry (thread_p);
return r;
}
#endif /* SERVER_MODE */
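/* illustrative note (not part of the original source): pgbuf_sleep and pgbuf_wakeup[_uncond] above implement a simple
* handshake: a waiter locks its own thread entry, releases the protecting mutex and suspends; the waking side sets
* resume_status to THREAD_PGBUF_RESUMED and signals the waiter's condition variable. */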
STATIC_INLINE void
pgbuf_set_dirty_buffer_ptr (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr)
{
PGBUF_HOLDER *holder;
assert (bufptr != NULL);
pgbuf_bcb_set_dirty (thread_p, bufptr);
holder = pgbuf_find_thrd_holder (thread_p, bufptr);
assert (get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_WRITE);
assert (holder != NULL);
if (holder != NULL && holder->perf_stat.dirtied_by_holder == 0)
{
holder->perf_stat.dirtied_by_holder = 1;
}
/* Record number of dirties in statistics */
perfmon_inc_stat (thread_p, PSTAT_PB_NUM_DIRTIES);
}
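/* minimal caller-side sketch (not part of the original source, guarded out): under the WAL discipline enforced in
* pgbuf_bcb_flush_with_wal, a writer first logs its change and only then marks the page dirty, typically through the
* public pgbuf_set_dirty, which ends up here. the logging call shown is only an assumed example of a recovery-logging
* routine. */
#if 0
/* page is held under PGBUF_LATCH_WRITE */
log_append_undoredo_data2 (thread_p, rcvindex, NULL, pgptr, offset, undo_size, redo_size, undo_data, redo_data);
pgbuf_set_dirty (thread_p, pgptr, DONT_FREE);
#endif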
/*
* pgbuf_wakeup_page_flush_daemon () - Wakeup the flushing daemon thread to flush some
* of the dirty pages in buffer pool to disk
* return : void
* thread_p (in) :
*/
static void
pgbuf_wakeup_page_flush_daemon (THREAD_ENTRY * thread_p)
{
#if defined (SERVER_MODE)
if (pgbuf_is_page_flush_daemon_available ())
{
pgbuf_Page_flush_daemon->wakeup ();
return;
}
#endif
PERF_UTIME_TRACKER dummy_time_tracker;
bool stop = false;
/* single-threaded environment. do flush on our own. */
dummy_time_tracker.is_perf_tracking = false;
pgbuf_flush_victim_candidates (thread_p, prm_get_float_value (PRM_ID_PB_BUFFER_FLUSH_RATIO), &dummy_time_tracker,
&stop);
assert (!stop);
}
/*
* pgbuf_has_perm_pages_fixed () -
*
* return : true if the thread has any permanent page fixed (query result pages are ignored).
* thread_p (in) : Thread entry.
*
*/
bool
pgbuf_has_perm_pages_fixed (THREAD_ENTRY * thread_p)
{
int thrd_idx = thread_get_entry_index (thread_p);
PGBUF_HOLDER *holder = NULL;
if (pgbuf_Pool.thrd_holder_info[thrd_idx].num_hold_cnt == 0)
{
return false;
}
for (holder = pgbuf_Pool.thrd_holder_info[thrd_idx].thrd_hold_list; holder != NULL; holder = holder->thrd_link)
{
if (holder->bufptr->iopage_buffer->iopage.prv.ptype != PAGE_QRESULT)
{
return true;
}
}
return false;
}
#if defined (SERVER_MODE)
/*
* pgbuf_is_thread_high_priority () -
*
* return : true if the thread has any fixed pages and another thread is waiting on any of them, or if it holds an
* important hot page such as a volume header, file header, index root or heap header.
* thread_p (in) : Thread entry.
*/
static bool
pgbuf_is_thread_high_priority (THREAD_ENTRY * thread_p)
{
int thrd_idx = thread_get_entry_index (thread_p);
PGBUF_HOLDER *holder = NULL;
if (pgbuf_Pool.thrd_holder_info[thrd_idx].num_hold_cnt == 0)
{
/* does not own any page */
return false;
}
for (holder = pgbuf_Pool.thrd_holder_info[thrd_idx].thrd_hold_list; holder != NULL; holder = holder->thrd_link)
{
if (holder->bufptr->next_wait_thrd != NULL)
{
/* someone is waiting on a page held by this thread */
return true;
}
if (holder->bufptr->iopage_buffer->iopage.prv.ptype == PAGE_VOLHEADER)
{
/* has volume header */
return true;
}
if (holder->bufptr->iopage_buffer->iopage.prv.ptype == PAGE_FTAB)
{
/* holds a file header page */
return true;
}
if (holder->bufptr->iopage_buffer->iopage.prv.ptype == PAGE_BTREE
&& (btree_get_perf_btree_page_type (thread_p, holder->bufptr->iopage_buffer->iopage.page)
== PERF_PAGE_BTREE_ROOT))
{
/* holds b-tree root */
return true;
}
if (holder->bufptr->iopage_buffer->iopage.prv.ptype == PAGE_HEAP
&& heap_is_page_header (thread_p, holder->bufptr->iopage_buffer->iopage.page))
{
/* heap file header */
return true;
}
}
return false;
}
#endif /* SERVER_MODE */
enum
{
NEIGHBOR_ABORT_RANGE = 1,
NEIGHBOR_ABORT_NOTFOUND_NONDIRTY_BACK,
NEIGHBOR_ABORT_NOTFOUND_DIRTY_BACK,
NEIGHBOR_ABORT_LATCH_NONDIRTY_BACK,
NEIGHBOR_ABORT_LATCH_DIRTY_BACK,
NEIGHBOR_ABORT_NONDIRTY_NOT_ALLOWED,
NEIGHBOR_ABORT_TWO_CONSECTIVE_NONDIRTIES,
NEIGHBOR_ABORT_TOO_MANY_NONDIRTIES
};
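/* note (not part of the original source): the NEIGHBOR_ABORT_* values above only feed the debug log emitted at the
* end of pgbuf_flush_page_and_neighbors_fb; they stop the neighbor search but have no other effect on control flow. */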
/*
* pgbuf_flush_page_and_neighbors_fb () - Flush page pointed to by the supplied BCB and also flush neighbor pages
*
* return : error code or NO_ERROR
* thread_p (in) : thread entry
* bufptr (in) : BCB to flush
* flushed_pages(out): actual number of flushed pages
*
* todo: too big to be inlined. maybe we can optimize it.
*/
static int
pgbuf_flush_page_and_neighbors_fb (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, int *flushed_pages)
{
#define PGBUF_PAGES_COUNT_THRESHOLD 4
int error = NO_ERROR, i;
int save_first_error = NO_ERROR;
LOG_LSA log_newest_oldest_unflush_lsa;
VPID first_vpid, vpid;
PGBUF_BUFFER_HASH *hash_anchor;
PGBUF_BATCH_FLUSH_HELPER *helper = &pgbuf_Flush_helper;
bool prev_page_dirty = true;
int dirty_pages_cnt = 0;
int pos;
bool forward;
bool search_nondirty;
int written_pages;
int abort_reason;
bool was_page_flushed = false;
#if defined(ENABLE_SYSTEMTAP)
QUERY_ID query_id = -1;
bool monitored = false;
#endif /* ENABLE_SYSTEMTAP */
#if defined(ENABLE_SYSTEMTAP)
query_id = qmgr_get_current_query_id (thread_p);
if (query_id != NULL_QUERY_ID)
{
monitored = true;
CUBRID_IO_WRITE_START (query_id);
}
#endif /* ENABLE_SYSTEMTAP */
/* init */
helper->npages = 0;
helper->fwd_offset = 0;
helper->back_offset = 0;
/* add bufptr as middle page */
pgbuf_add_bufptr_to_batch (bufptr, 0);
VPID_COPY (&first_vpid, &bufptr->vpid);
LSA_COPY (&log_newest_oldest_unflush_lsa, &bufptr->oldest_unflush_lsa);
PGBUF_BCB_UNLOCK (bufptr);
VPID_COPY (&vpid, &first_vpid);
/* Now search around bufptr->vpid for neighbors. */
forward = true;
search_nondirty = false;
abort_reason = 0;
for (i = 1; i < PGBUF_NEIGHBOR_PAGES;)
{
if (forward == true)
{
if (first_vpid.pageid <= PAGEID_MAX - (helper->fwd_offset + 1))
{
vpid.pageid = first_vpid.pageid + helper->fwd_offset + 1;
}
else
{
abort_reason = NEIGHBOR_ABORT_RANGE;
break;
}
}
else
{
if (first_vpid.pageid >= helper->back_offset + 1)
{
vpid.pageid = first_vpid.pageid - helper->back_offset - 1;
}
else if (PGBUF_NEIGHBOR_FLUSH_NONDIRTY == false || search_nondirty == true)
{
abort_reason = NEIGHBOR_ABORT_RANGE;
break;
}
else
{
search_nondirty = true;
forward = true;
continue;
}
}
hash_anchor = &pgbuf_Pool.buf_hash_table[PGBUF_HASH_VALUE (&vpid)];
bufptr = pgbuf_search_hash_chain (thread_p, hash_anchor, &vpid);
if (bufptr == NULL)
{
/* Page not found: change direction or abandon batch */
pthread_mutex_unlock (&hash_anchor->hash_mutex);
if (search_nondirty == true)
{
if (forward == false)
{
abort_reason = NEIGHBOR_ABORT_NOTFOUND_NONDIRTY_BACK;
break;
}
else
{
forward = false;
continue;
}
}
else
{
if (forward == true)
{
forward = false;
continue;
}
else if (PGBUF_NEIGHBOR_FLUSH_NONDIRTY == true)
{
search_nondirty = true;
forward = true;
continue;
}
else
{
abort_reason = NEIGHBOR_ABORT_NOTFOUND_DIRTY_BACK;
break;
}
}
}
/* Abandon batch for: fixed pages, latched pages or with 'avoid_victim' */
if (pgbuf_bcb_is_flushing (bufptr) || get_latch (&bufptr->atomic_latch) > PGBUF_LATCH_READ)
{
PGBUF_BCB_UNLOCK (bufptr);
if (search_nondirty == true)
{
if (forward == false)
{
abort_reason = NEIGHBOR_ABORT_LATCH_NONDIRTY_BACK;
break;
}
else
{
forward = false;
continue;
}
}
else
{
if (forward == true)
{
forward = false;
continue;
}
else if (PGBUF_NEIGHBOR_FLUSH_NONDIRTY == true)
{
search_nondirty = true;
forward = true;
continue;
}
else
{
abort_reason = NEIGHBOR_ABORT_LATCH_DIRTY_BACK;
break;
}
}
}
if (!pgbuf_bcb_is_dirty (bufptr))
{
if (search_nondirty == false)
{
PGBUF_BCB_UNLOCK (bufptr);
if (forward == true)
{
forward = false;
continue;
}
else if (PGBUF_NEIGHBOR_FLUSH_NONDIRTY == true)
{
search_nondirty = true;
forward = true;
continue;
}
abort_reason = NEIGHBOR_ABORT_NONDIRTY_NOT_ALLOWED;
break;
}
if (prev_page_dirty == false)
{
/* two consecutive non-dirty pages */
PGBUF_BCB_UNLOCK (bufptr);
abort_reason = NEIGHBOR_ABORT_TWO_CONSECTIVE_NONDIRTIES;
break;
}
}
else
{
if (LSA_LT (&log_newest_oldest_unflush_lsa, &bufptr->oldest_unflush_lsa))
{
LSA_COPY (&log_newest_oldest_unflush_lsa, &bufptr->oldest_unflush_lsa);
}
dirty_pages_cnt++;
}
if (helper->npages > PGBUF_PAGES_COUNT_THRESHOLD && ((2 * dirty_pages_cnt) < helper->npages))
{
/* too many non dirty pages */
PGBUF_BCB_UNLOCK (bufptr);
helper->npages = 1;
abort_reason = NEIGHBOR_ABORT_TOO_MANY_NONDIRTIES;
break;
}
prev_page_dirty = pgbuf_bcb_is_dirty (bufptr);
/* add bufptr to batch */
pgbuf_add_bufptr_to_batch (bufptr, vpid.pageid - first_vpid.pageid);
PGBUF_BCB_UNLOCK (bufptr);
i++;
}
if (prev_page_dirty == true)
{
if (helper->fwd_offset > 0 && !pgbuf_bcb_is_dirty (helper->pages_bufptr[PGBUF_NEIGHBOR_POS (helper->fwd_offset)]))
{
helper->fwd_offset--;
helper->npages--;
}
if (helper->back_offset > 0
&& !pgbuf_bcb_is_dirty (helper->pages_bufptr[PGBUF_NEIGHBOR_POS (-helper->back_offset)]))
{
helper->back_offset--;
helper->npages--;
}
}
if (helper->npages <= 1)
{
/* flush only first page */
pos = PGBUF_NEIGHBOR_POS (0);
bufptr = helper->pages_bufptr[pos];
error = pgbuf_flush_neighbor_safe (thread_p, bufptr, &helper->vpids[pos], &was_page_flushed);
if (error != NO_ERROR)
{
ASSERT_ERROR ();
return error;
}
if (was_page_flushed)
{
*flushed_pages = 1;
}
return NO_ERROR;
}
/* WAL protocol: force log record to disk */
logpb_flush_log_for_wal (thread_p, &log_newest_oldest_unflush_lsa);
written_pages = 0;
for (pos = PGBUF_NEIGHBOR_POS (-helper->back_offset); pos <= PGBUF_NEIGHBOR_POS (helper->fwd_offset); pos++)
{
bufptr = helper->pages_bufptr[pos];
error = pgbuf_flush_neighbor_safe (thread_p, bufptr, &helper->vpids[pos], &was_page_flushed);
if (error != NO_ERROR)
{
ASSERT_ERROR ();
if (save_first_error == NO_ERROR)
{
save_first_error = error;
}
continue;
}
if (was_page_flushed)
{
written_pages++;
}
}
if (prm_get_bool_value (PRM_ID_LOG_PGBUF_VICTIM_FLUSH))
{
_er_log_debug (ARG_FILE_LINE,
"pgbuf_flush_page_and_neighbors_fb: collected_pages:%d, written:%d, back_offset:%d, fwd_offset%d, "
"abort_reason:%d", helper->npages, written_pages, helper->back_offset, helper->fwd_offset,
abort_reason);
}
*flushed_pages = written_pages;
helper->npages = 0;
return save_first_error;
#undef PGBUF_PAGES_COUNT_THRESHOLD
}
/*
* pgbuf_add_bufptr_to_batch () - Add a page to the flush helper
* return : void
* bufptr (in) : BCB of page to add
* idx (in) : page offset relative to the first (requested) page of the batch
*/
STATIC_INLINE void
pgbuf_add_bufptr_to_batch (PGBUF_BCB * bufptr, int idx)
{
PGBUF_BATCH_FLUSH_HELPER *helper = &pgbuf_Flush_helper;
int pos;
assert (get_latch (&bufptr->atomic_latch) == PGBUF_NO_LATCH || get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_READ
|| get_latch (&bufptr->atomic_latch) == PGBUF_LATCH_WRITE);
assert (idx > -PGBUF_NEIGHBOR_PAGES && idx < PGBUF_NEIGHBOR_PAGES);
pos = PGBUF_NEIGHBOR_POS (idx);
VPID_COPY (&helper->vpids[pos], &bufptr->vpid);
helper->pages_bufptr[pos] = bufptr;
helper->npages++;
if (idx > 0)
{
helper->fwd_offset++;
}
else if (idx < 0)
{
helper->back_offset++;
}
}
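/* illustrative note (not part of the original source): assuming PGBUF_NEIGHBOR_POS maps a signed page offset onto a
* zero-based array slot, for example
*
* #define PGBUF_NEIGHBOR_POS(idx) (PGBUF_NEIGHBOR_PAGES - 1 + (idx))
*
* the collected batch occupies the contiguous window [PGBUF_NEIGHBOR_POS (-back_offset),
* PGBUF_NEIGHBOR_POS (fwd_offset)], with the originally requested page at PGBUF_NEIGHBOR_POS (0). this is exactly the
* window iterated by the flush loop in pgbuf_flush_page_and_neighbors_fb. */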
/*
* pgbuf_flush_neighbor_safe () - Flush collected page for neighbor flush if it's safe:
* 1. VPID of bufptr has not changed.
* 2. Page has no latch or is only latched for read.
*
* return : Error code.
* thread_p (in) : Thread entry.
* bufptr (in) : Buffered page collected for neighbor flush.
* expected_vpid (in) : Expected VPID for bufptr.
* flushed (out) : Output true if page was flushed.
*/
STATIC_INLINE int
pgbuf_flush_neighbor_safe (THREAD_ENTRY * thread_p, PGBUF_BCB * bufptr, VPID * expected_vpid, bool * flushed)
{
int error = NO_ERROR;
bool is_bcb_locked = true;
assert (bufptr != NULL);
assert (expected_vpid != NULL && !VPID_ISNULL (expected_vpid));
assert (flushed != NULL);
*flushed = false;
PGBUF_BCB_LOCK (bufptr);
if (!VPID_EQ (&bufptr->vpid, expected_vpid))
{
PGBUF_BCB_UNLOCK (bufptr);
return NO_ERROR;
}
if (pgbuf_bcb_is_flushing (bufptr) || get_latch (&bufptr->atomic_latch) > PGBUF_LATCH_READ)
{
PGBUF_BCB_UNLOCK (bufptr);
return NO_ERROR;
}
/* flush even if it is not dirty. todo: is this necessary? */
error = pgbuf_bcb_flush_with_wal (thread_p, bufptr, true, &is_bcb_locked);
if (is_bcb_locked)
{
PGBUF_BCB_UNLOCK (bufptr);
}
if (error == NO_ERROR)
{
*flushed = true;
}
else
{
ASSERT_ERROR ();
}
return error;
}
/*
* pgbuf_compare_hold_vpid_for_sort () - Compare the vpid for sort
* return: p1 - p2
* p1(in): victim candidate list 1
* p2(in): victim candidate list 2
*/
static int
pgbuf_compare_hold_vpid_for_sort (const void *p1, const void *p2)
{
PGBUF_HOLDER_INFO *h1, *h2;
int diff;
h1 = (PGBUF_HOLDER_INFO *) p1;
h2 = (PGBUF_HOLDER_INFO *) p2;
if (h1 == h2)
{
return 0;
}
/* Pages with NULL GROUP sort last */
if (VPID_ISNULL (&h1->group_id) && !VPID_ISNULL (&h2->group_id))
{
return 1;
}
else if (!VPID_ISNULL (&h1->group_id) && VPID_ISNULL (&h2->group_id))
{
return -1;
}
diff = h1->group_id.volid - h2->group_id.volid;
if (diff != 0)
{
return diff;
}
diff = h1->group_id.pageid - h2->group_id.pageid;
if (diff != 0)
{
return diff;
}
diff = h1->rank - h2->rank;
if (diff != 0)
{
return diff;
}
diff = h1->vpid.volid - h2->vpid.volid;
if (diff != 0)
{
return diff;
}
diff = h1->vpid.pageid - h2->vpid.pageid;
if (diff != 0)
{
return diff;
}
return diff;
}
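/* minimal usage sketch (not part of the original source, guarded out): the comparator is meant for qsort over the
* holder-info array, so that pages are re-fixed in a canonical (group, rank, vpid) order that avoids latch deadlocks.
* 'infos' and 'n' are hypothetical. */
#if 0
qsort (infos, n, sizeof (PGBUF_HOLDER_INFO), pgbuf_compare_hold_vpid_for_sort);
#endif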
/*
* pgbuf_ordered_fix () - Fix page in VPID order; other previously fixed pages may be unfixed and re-fixed again.
* return: error code
* thread_p(in):
* req_vpid(in):
* fetch_mode(in): old or new page
* request_mode(in): latch mode
* req_watcher(in/out): page watcher object, also holds output page pointer
*
* Note: If re-fixing previously fixed pages (unfixed by this request) fails, the requested page is unfixed
* (if fixed) and an error is returned. In such a case, some older pages may have been re-fixed, others not: the
* caller should check the page pointers of the watchers before using them in case of error.
*
* Note2: If any re-fix occurs for previously fixed pages, the 'unfix' flag in their watcher is set.
* (the caller is responsible for checking this flag)
*
*/
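/* minimal usage sketch (not part of the original source, guarded out): fixing a heap page through a watcher with
* ordered fix. the watcher macros come from the public page_buffer.h interface; treat exact names and arguments as
* assumptions. */
#if 0
PGBUF_WATCHER watcher;
PGBUF_INIT_WATCHER (&watcher, PGBUF_ORDERED_HEAP_NORMAL, &hfid);
if (pgbuf_ordered_fix (thread_p, &vpid, OLD_PAGE, PGBUF_LATCH_WRITE, &watcher) != NO_ERROR)
{
/* previously fixed pages may have been unfixed/re-fixed; watchers must be re-checked before trusting cached
* page pointers */
return ER_FAILED;
}
/* ... use watcher.pgptr ... */
pgbuf_ordered_unfix (thread_p, &watcher);
#endif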
#if !defined(NDEBUG)
int
pgbuf_ordered_fix_debug (THREAD_ENTRY * thread_p, const VPID * req_vpid, PAGE_FETCH_MODE fetch_mode,
const PGBUF_LATCH_MODE request_mode, PGBUF_WATCHER * req_watcher, const char *caller_file,
int caller_line, const char *caller_func)
#else /* NDEBUG */
int
pgbuf_ordered_fix_release (THREAD_ENTRY * thread_p, const VPID * req_vpid, PAGE_FETCH_MODE fetch_mode,
const PGBUF_LATCH_MODE request_mode, PGBUF_WATCHER * req_watcher)
#endif /* NDEBUG */
{
int er_status = NO_ERROR;
PGBUF_HOLDER *holder, *next_holder;
PAGE_PTR pgptr, ret_pgptr;
int i, thrd_idx;
int saved_pages_cnt = 0;
PGBUF_LATCH_MODE curr_request_mode;
PAGE_FETCH_MODE curr_fetch_mode;
PGBUF_HOLDER_INFO ordered_holders_info[PGBUF_MAX_PAGE_FIXED_BY_TRAN];
PGBUF_HOLDER_INFO req_page_holder_info;
bool req_page_has_watcher;
bool req_page_has_group = false;
int er_status_get_hfid = NO_ERROR;
VPID req_page_groupid;
bool has_dealloc_prevent_flag = false;
PGBUF_LATCH_CONDITION latch_condition;
PGBUF_BCB *bufptr = NULL;
#if defined(PGBUF_ORDERED_DEBUG)
static unsigned int global_ordered_fix_id = 0;
unsigned int ordered_fix_id;
#endif
assert (req_watcher != NULL);
#if defined(PGBUF_ORDERED_DEBUG)
ordered_fix_id = global_ordered_fix_id++;
#endif
#if !defined(NDEBUG)
assert (req_watcher->magic == PGBUF_WATCHER_MAGIC_NUMBER);
#endif
ret_pgptr = NULL;
req_page_has_watcher = false;
if (req_watcher->pgptr != NULL)
{
assert_release (false);
er_status = ER_FAILED_ASSERTION;
goto exit;
}
/* set or promote current page rank */
if (VPID_EQ (&req_watcher->group_id, req_vpid))
{
req_watcher->curr_rank = PGBUF_ORDERED_HEAP_HDR;
}
else
{
req_watcher->curr_rank = req_watcher->initial_rank;
}
req_page_has_group = VPID_ISNULL (&req_watcher->group_id) ? false : true;
if (req_page_has_group == false)
{
VPID_SET_NULL (&req_page_groupid);
}
VPID_COPY (&req_page_holder_info.group_id, &req_watcher->group_id);
req_page_holder_info.rank = req_watcher->curr_rank;
VPID_COPY (&req_page_holder_info.vpid, req_vpid);
req_page_holder_info.watch_count = 1;
req_page_holder_info.watcher[0] = req_watcher;
thrd_idx = thread_get_entry_index (thread_p);
holder = pgbuf_Pool.thrd_holder_info[thrd_idx].thrd_hold_list;
if ((holder == NULL) || ((holder->thrd_link == NULL) && (VPID_EQ (req_vpid, &(holder->bufptr->vpid)))))
{
/* There are no other fixed pages or only the requested page was already fixed */
latch_condition = PGBUF_UNCONDITIONAL_LATCH;
}
else
{
latch_condition = PGBUF_CONDITIONAL_LATCH;
}
#if !defined(NDEBUG)
ret_pgptr =
pgbuf_fix_debug (thread_p, req_vpid, fetch_mode, request_mode, latch_condition, caller_file, caller_line,
caller_func);
#else
ret_pgptr = pgbuf_fix_release (thread_p, req_vpid, fetch_mode, request_mode, latch_condition);
#endif
if (ret_pgptr != NULL)
{
for (holder = pgbuf_Pool.thrd_holder_info[thrd_idx].thrd_hold_list; holder != NULL; holder = holder->thrd_link)
{
CAST_BFPTR_TO_PGPTR (ret_pgptr, holder->bufptr);
if (VPID_EQ (req_vpid, &(holder->bufptr->vpid)))
{
assert (PGBUF_IS_ORDERED_PAGETYPE (holder->bufptr->iopage_buffer->iopage.prv.ptype));
if (req_page_has_group == false && holder->first_watcher != NULL)
{
/* special case : already have a fix on this page with a watcher; get group id from existing watcher */
assert (holder->watch_count > 0);
assert (!VPID_ISNULL (&holder->first_watcher->group_id));
VPID_COPY (&req_watcher->group_id, &holder->first_watcher->group_id);
}
else if (req_page_has_group == false && pgbuf_get_page_ptype (thread_p, ret_pgptr) == PAGE_HEAP)
{
er_status = pgbuf_get_groupid_and_unfix (thread_p, req_vpid, &ret_pgptr, &req_page_groupid, false);
if (er_status != NO_ERROR)
{
er_status_get_hfid = er_status;
goto exit;
}
assert (!VPID_ISNULL (&req_page_groupid));
VPID_COPY (&req_watcher->group_id, &req_page_groupid);
}
#if !defined(NDEBUG)
pgbuf_add_watch_instance_internal (holder, ret_pgptr, req_watcher, request_mode, true, caller_file,
caller_line);
#else
pgbuf_add_watch_instance_internal (holder, ret_pgptr, req_watcher, request_mode, true);
#endif
req_page_has_watcher = true;
goto exit;
}
}
assert_release (false);
er_status = ER_FAILED_ASSERTION;
goto exit;
}
else
{
int wait_msecs;
assert (ret_pgptr == NULL);
er_status = er_errid_if_has_error ();
if (er_status == ER_PB_BAD_PAGEID || er_status == ER_INTERRUPTED)
{
goto exit;
}
/* OLD_PAGE_MAYBE_DEALLOCATED sets ER_WARNING_SEVERITY for ER_PB_BAD_PAGEID,
* which er_errid_if_has_error() does not catch; handle it explicitly here. */
if (fetch_mode == OLD_PAGE_MAYBE_DEALLOCATED && er_errid () == ER_PB_BAD_PAGEID)
{
er_status = ER_PB_BAD_PAGEID;
goto exit;
}
wait_msecs = pgbuf_find_current_wait_msecs (thread_p);
if (wait_msecs == LK_ZERO_WAIT || wait_msecs == LK_FORCE_ZERO_WAIT)
{
/* attempts to unfix-refix the old page may fail since a CONDITIONAL latch will be enforced; just report that the
* page cannot be fixed */
if (er_status == NO_ERROR)
{
/* LK_FORCE_ZERO_WAIT is used in some page scan functions (e.g. heap_stats_find_page_in_bestspace) to
* skip busy pages; here we return an error code (which means the page was not fixed), however no error
* is set: this allows the scan of pages to continue */
assert (wait_msecs == LK_FORCE_ZERO_WAIT);
er_status = ER_LK_PAGE_TIMEOUT;
}
goto exit;
}
if (latch_condition == PGBUF_UNCONDITIONAL_LATCH)
{
/* continue */
er_status = er_errid ();
if (er_status == NO_ERROR)
{
er_status = ER_FAILED;
}
goto exit;
}
/* to proceed ordered fix the pages, forget any underlying error. */
er_status = NO_ERROR;
}
if (fetch_mode == OLD_PAGE_PREVENT_DEALLOC)
{
has_dealloc_prevent_flag = true;
fetch_mode = OLD_PAGE;
}
holder = pgbuf_Pool.thrd_holder_info[thrd_idx].thrd_hold_list;
while (holder != NULL)
{
next_holder = holder->thrd_link;
if (holder->watch_count <= 0)
{
/* cannot perform unfix-ordered fix without watcher; we assume that this holder's page will not trigger a
* latch deadlock and ignore it */
holder = next_holder;
continue;
}
assert (PGBUF_IS_ORDERED_PAGETYPE (holder->bufptr->iopage_buffer->iopage.prv.ptype));
if (saved_pages_cnt >= PGBUF_MAX_PAGE_FIXED_BY_TRAN)
{
assert_release (false);
er_status = ER_FAILED_ASSERTION;
goto exit;
}
else if (VPID_EQ (req_vpid, &(holder->bufptr->vpid)))
{
/* already have a fix on this page, should not be here */
if (pgbuf_is_valid_page (thread_p, req_vpid, false) != DISK_VALID)
{
#if defined(PGBUF_ORDERED_DEBUG)
_er_log_debug (__FILE__, __LINE__,
"ORDERED_FIX(%u): page VPID:(%d,%d) (GROUP:%d,%d; rank:%d/%d) "
"invalid, while having holder: %X ", ordered_fix_id, req_vpid->volid, req_vpid->pageid,
req_watcher->group_id.volid, req_watcher->group_id.pageid, req_watcher->curr_rank,
req_watcher->initial_rank, holder);
#endif
er_status = er_errid ();
}
else
{
er_status = ER_FAILED_ASSERTION;
}
assert_release (false);
goto exit;
}
else
{
int holder_fix_cnt;
int j, diff;
PAGE_PTR save_page_ptr = NULL;
PGBUF_WATCHER *pg_watcher;
int page_rank;
PGBUF_ORDERED_GROUP group_id;
page_rank = PGBUF_ORDERED_RANK_UNDEFINED;
VPID_SET_NULL (&group_id);
holder_fix_cnt = holder->fix_count;
if (holder_fix_cnt != holder->watch_count)
{
/* this page was fixed without a watcher and was not unfixed before another page fix; we do not allow
* this */
assert_release (false);
er_status = ER_FAILED_ASSERTION;
goto exit;
}
assert (holder->watch_count < PGBUF_MAX_PAGE_WATCHERS);
ordered_holders_info[saved_pages_cnt].latch_mode = PGBUF_LATCH_READ;
pg_watcher = holder->first_watcher;
j = 0;
ordered_holders_info[saved_pages_cnt].prevent_dealloc = false;
/* add all watchers */
while (pg_watcher != NULL)
{
#if !defined(NDEBUG)
CAST_BFPTR_TO_PGPTR (pgptr, holder->bufptr);
assert (pg_watcher->magic == PGBUF_WATCHER_MAGIC_NUMBER);
assert (pg_watcher->pgptr == pgptr);
assert (pg_watcher->curr_rank < PGBUF_ORDERED_RANK_UNDEFINED);
assert (!VPID_ISNULL (&pg_watcher->group_id));
#endif
if (page_rank == PGBUF_ORDERED_RANK_UNDEFINED)
{
page_rank = pg_watcher->curr_rank;
}
else if (page_rank != pg_watcher->curr_rank)
{
/* all watchers on this page should have the same rank */
char additional_msg[128];
snprintf (additional_msg, sizeof (additional_msg) - 1, "different page ranks:%d,%d", page_rank,
pg_watcher->curr_rank);
er_status = ER_PB_ORDERED_INCONSISTENCY;
er_set (ER_FATAL_ERROR_SEVERITY, ARG_FILE_LINE, er_status, 5, req_vpid->volid, req_vpid->pageid,
holder->bufptr->vpid.volid, holder->bufptr->vpid.pageid, additional_msg);
goto exit;
}
if (VPID_ISNULL (&group_id))
{
VPID_COPY (&group_id, &pg_watcher->group_id);
}
else if (!VPID_EQ (&group_id, &pg_watcher->group_id))
{
char additional_msg[128];
snprintf (additional_msg, sizeof (additional_msg) - 1, "different GROUP_ID : (%d,%d) and (%d,%d)",
group_id.volid, group_id.pageid, pg_watcher->group_id.volid, pg_watcher->group_id.pageid);
/* all watchers on this page should have the same group id */
er_status = ER_PB_ORDERED_INCONSISTENCY;
er_set (ER_FATAL_ERROR_SEVERITY, ARG_FILE_LINE, er_status, 5, req_vpid->volid, req_vpid->pageid,
holder->bufptr->vpid.volid, holder->bufptr->vpid.pageid, additional_msg);
goto exit;
}
if (save_page_ptr == NULL)
{
save_page_ptr = pg_watcher->pgptr;
}
else
{
assert (save_page_ptr == pg_watcher->pgptr);
}
ordered_holders_info[saved_pages_cnt].watcher[j] = pg_watcher;
if (pg_watcher->latch_mode == PGBUF_LATCH_WRITE)
{
ordered_holders_info[saved_pages_cnt].latch_mode = PGBUF_LATCH_WRITE;
}
j++;
#if defined(PGBUF_ORDERED_DEBUG)
_er_log_debug (__FILE__, __LINE__,
"ordered_fix(%u): check_watcher: pgptr:%X, VPID:(%d,%d), GROUP:%d,%d, rank:%d/%d, "
"holder_fix_count:%d, holder_watch_count:%d, holder_fixed_at:%s", ordered_fix_id,
pg_watcher->pgptr, holder->bufptr->vpid.volid, holder->bufptr->vpid.pageid,
pg_watcher->group_id.volid, pg_watcher->group_id.pageid, pg_watcher->curr_rank,
pg_watcher->initial_rank, holder->fix_count, holder->watch_count, holder->fixed_at);
#endif
pg_watcher = pg_watcher->next;
}
assert (j == holder->watch_count);
VPID_COPY (&ordered_holders_info[saved_pages_cnt].group_id, &group_id);
ordered_holders_info[saved_pages_cnt].rank = page_rank;
VPID_COPY (&(ordered_holders_info[saved_pages_cnt].vpid), &(holder->bufptr->vpid));
if (req_page_has_group == true)
{
diff = pgbuf_compare_hold_vpid_for_sort (&req_page_holder_info, &ordered_holders_info[saved_pages_cnt]);
}
else
{
/* page needs to be unfixed */
diff = -1;
}
if (diff < 0)
{
ordered_holders_info[saved_pages_cnt].watch_count = holder->watch_count;
ordered_holders_info[saved_pages_cnt].ptype = (PAGE_TYPE) holder->bufptr->iopage_buffer->iopage.prv.ptype;
#if defined(PGBUF_ORDERED_DEBUG)
_er_log_debug (__FILE__, __LINE__,
"ordered_fix(%u): save_watchers (%d): pgptr:%X, VPID:(%d,%d), "
"GROUP:(%d,%d), rank:%d(page_rank:%d), holder_fix_count:%d, holder_watch_count:%d",
ordered_fix_id, ordered_holders_info[saved_pages_cnt].watch_count, save_page_ptr,
ordered_holders_info[saved_pages_cnt].vpid.volid,
ordered_holders_info[saved_pages_cnt].vpid.pageid,
ordered_holders_info[saved_pages_cnt].group_id.volid,
ordered_holders_info[saved_pages_cnt].group_id.pageid,
ordered_holders_info[saved_pages_cnt].rank, page_rank, holder_fix_cnt,
holder->watch_count);
#endif
saved_pages_cnt++;
}
else if (diff == 0)
{
assert_release (false);
er_status = ER_FAILED_ASSERTION;
goto exit;
}
else
{
assert (diff > 0);
/* this page is correctly fixed before new requested page, the accumulated watchers are just ignored */
#if defined(PGBUF_ORDERED_DEBUG)
_er_log_debug (__FILE__, __LINE__,
"ordered_fix(%u): ignore: pgptr:%X, VPID:(%d,%d) "
"GROUP:(%d,%d), rank:%d --- ignored", ordered_fix_id, save_page_ptr,
ordered_holders_info[saved_pages_cnt].vpid.volid,
ordered_holders_info[saved_pages_cnt].vpid.pageid,
ordered_holders_info[saved_pages_cnt].group_id.volid,
ordered_holders_info[saved_pages_cnt].group_id.pageid,
ordered_holders_info[saved_pages_cnt].rank);
#endif
}
}
holder = next_holder;
}
holder = pgbuf_Pool.thrd_holder_info[thrd_idx].thrd_hold_list;
/* unfix pages which do not fulfill the VPID order */
for (i = 0; i < saved_pages_cnt; i++)
{
int j, holder_fix_cnt;
#if defined(PGBUF_ORDERED_DEBUG)
int holder_fix_cnt_save;
#endif
while (holder != NULL && !VPID_EQ (&(ordered_holders_info[i].vpid), &(holder->bufptr->vpid)))
{
holder = holder->thrd_link;
}
if (holder == NULL)
{
assert_release (false);
er_status = ER_FAILED_ASSERTION;
goto exit;
}
next_holder = holder->thrd_link;
/* not necessary to remove each watcher since the holder will be removed completely */
holder->watch_count = 0;
holder->first_watcher = NULL;
holder->last_watcher = NULL;
holder_fix_cnt = holder->fix_count;
#if defined(PGBUF_ORDERED_DEBUG)
holder_fix_cnt_save = holder_fix_cnt;
#endif
CAST_BFPTR_TO_PGPTR (pgptr, holder->bufptr);
assert (holder_fix_cnt > 0);
/* prevent deallocate. */
pgbuf_bcb_register_avoid_deallocation (holder->bufptr);
ordered_holders_info[i].prevent_dealloc = true;
while (holder_fix_cnt-- > 0)
{
pgbuf_unfix (thread_p, pgptr);
}
for (j = 0; j < ordered_holders_info[i].watch_count; j++)
{
PGBUF_WATCHER *pg_watcher;
pg_watcher = ordered_holders_info[i].watcher[j];
assert (pg_watcher->pgptr == pgptr);
assert (pg_watcher->curr_rank < PGBUF_ORDERED_RANK_UNDEFINED);
#if defined(PGBUF_ORDERED_DEBUG)
_er_log_debug (__FILE__, __LINE__,
"ordered_fix(%u): unfix & clear_watcher(%d/%d): pgptr:%X, VPID:(%d,%d), GROUP:%d,%d, "
"rank:%d/%d, latch_mode:%d, holder_fix_cnt:%d", ordered_fix_id, j + 1,
ordered_holders_info[i].watch_count, pg_watcher->pgptr, ordered_holders_info[i].vpid.volid,
ordered_holders_info[i].vpid.pageid, pg_watcher->group_id.volid, pg_watcher->group_id.pageid,
pg_watcher->curr_rank, pg_watcher->initial_rank, pg_watcher->latch_mode, holder_fix_cnt_save);
#endif
PGBUF_CLEAR_WATCHER (pg_watcher);
pg_watcher->page_was_unfixed = true;
#if !defined(NDEBUG)
pgbuf_watcher_init_debug (pg_watcher, caller_file, caller_line, true);
#endif
}
holder = next_holder;
}
/* The following code assumes that if a class OID is deleted after the requested page is unlatched, the HFID page is
* not reassigned to an ordinary page; otherwise a page deadlock may occur in the worst case. Example of a scenario
* in which such a situation may occur, assuming an existing latch on VPID1 (0, 90):
* 1. Fix requested page VPID2 (0, 100), get class_oid from the page.
* 2. Unfix requested page.
* 3. Get HFID from schema. <between 2 and 3, other threads drop the class, and the HFID page is reused, along with
*    the current page, which may be allocated to the HFID of another class>
* 4. Still assuming that the HFID is valid, this thread starts latching pages in order VPID1, VPID2. At the same
*    time, another thread starts latching pages VPID1 and VPID2, but since this later thread knows that VPID2 is a
*    HFID, it will use the order VPID2, VPID1. */
if (req_page_has_group == false)
{
#if !defined(NDEBUG)
/* all previous pages with watcher have been unfixed */
holder = pgbuf_Pool.thrd_holder_info[thrd_idx].thrd_hold_list;
while (holder != NULL)
{
assert (holder->watch_count == 0);
holder = holder->thrd_link;
}
pgptr =
pgbuf_fix_debug (thread_p, req_vpid, fetch_mode, request_mode, PGBUF_UNCONDITIONAL_LATCH, caller_file,
caller_line, caller_func);
#else
pgptr = pgbuf_fix_release (thread_p, req_vpid, fetch_mode, request_mode, PGBUF_UNCONDITIONAL_LATCH);
#endif
if (pgptr != NULL)
{
if (has_dealloc_prevent_flag == true)
{
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
pgbuf_bcb_unregister_avoid_deallocation (bufptr);
has_dealloc_prevent_flag = false;
}
if (pgbuf_get_page_ptype (thread_p, pgptr) == PAGE_HEAP)
{
er_status = pgbuf_get_groupid_and_unfix (thread_p, req_vpid, &pgptr, &req_page_groupid, true);
if (er_status != NO_ERROR)
{
er_status_get_hfid = er_status;
/* continue (re-latch old pages) */
}
}
}
else
{
/* continue */
er_status_get_hfid = er_errid ();
if (er_status_get_hfid == NO_ERROR)
{
er_status_get_hfid = ER_FAILED;
}
}
}
#if defined(PGBUF_ORDERED_DEBUG)
_er_log_debug (__FILE__, __LINE__,
"ordered_fix(%u) : restore_pages: %d, req_VPID(%d,%d), GROUP(%d,%d), rank:%d/%d", ordered_fix_id,
saved_pages_cnt, req_vpid->volid, req_vpid->pageid, req_watcher->group_id.volid,
req_watcher->group_id.pageid, req_watcher->curr_rank, req_watcher->initial_rank);
#endif
/* add requested page, watch instance is added after page is fixed */
if (req_page_has_group == true || er_status_get_hfid == NO_ERROR)
{
if (req_page_has_group)
{
VPID_COPY (&(ordered_holders_info[saved_pages_cnt].group_id), &req_watcher->group_id);
}
else
{
assert (!VPID_ISNULL (&req_page_groupid));
VPID_COPY (&req_watcher->group_id, &req_page_groupid);
VPID_COPY (&(ordered_holders_info[saved_pages_cnt].group_id), &req_page_groupid);
}
VPID_COPY (&(ordered_holders_info[saved_pages_cnt].vpid), req_vpid);
if (req_page_has_group)
{
ordered_holders_info[saved_pages_cnt].rank = req_watcher->curr_rank;
}
else
{
if (VPID_EQ (&(ordered_holders_info[saved_pages_cnt].group_id), req_vpid))
{
ordered_holders_info[saved_pages_cnt].rank = PGBUF_ORDERED_HEAP_HDR;
}
else
{
/* leave rank set by user */
ordered_holders_info[saved_pages_cnt].rank = req_watcher->curr_rank;
}
}
ordered_holders_info[saved_pages_cnt].prevent_dealloc = false;
saved_pages_cnt++;
}
if (saved_pages_cnt > 1)
{
qsort (ordered_holders_info, saved_pages_cnt, sizeof (ordered_holders_info[0]), pgbuf_compare_hold_vpid_for_sort);
}
/* restore fixes on previously unfixed pages and fix the requested page */
for (i = 0; i < saved_pages_cnt; i++)
{
if (VPID_EQ (req_vpid, &(ordered_holders_info[i].vpid)))
{
curr_request_mode = request_mode;
curr_fetch_mode = fetch_mode;
}
else
{
curr_request_mode = ordered_holders_info[i].latch_mode;
curr_fetch_mode = OLD_PAGE;
}
#if !defined(NDEBUG)
pgptr =
pgbuf_fix_debug (thread_p, &(ordered_holders_info[i].vpid), curr_fetch_mode, curr_request_mode,
PGBUF_UNCONDITIONAL_LATCH, caller_file, caller_line, caller_func);
#else
pgptr =
pgbuf_fix_release (thread_p, &(ordered_holders_info[i].vpid), curr_fetch_mode, curr_request_mode,
PGBUF_UNCONDITIONAL_LATCH);
#endif
if (pgptr == NULL)
{
er_status = er_errid ();
if (er_status == ER_INTERRUPTED)
{
/* this is expected */
goto exit;
}
if (er_status == ER_PB_BAD_PAGEID)
{
if (VPID_EQ (req_vpid, &(ordered_holders_info[i].vpid)) && fetch_mode == OLD_PAGE_MAYBE_DEALLOCATED)
{
/* page was deallocated between ftab snapshot and actual fix; this is expected with
* OLD_PAGE_MAYBE_DEALLOCATED. */
er_log_debug (ARG_FILE_LINE,
"pgbuf_ordered_fix: page %d|%d was deallocated (OLD_PAGE_MAYBE_DEALLOCATED mode).\n",
VPID_AS_ARGS (&ordered_holders_info[i].vpid));
}
else
{
/* the page was deallocated even though we prevented deallocation; this should be impossible */
assert (false);
er_log_debug (ARG_FILE_LINE, "pgbuf_ordered_fix: page %d|%d was deallocated and we told it not to!\n",
VPID_AS_ARGS (&ordered_holders_info[i].vpid));
}
}
if (!VPID_EQ (req_vpid, &(ordered_holders_info[i].vpid)))
{
int prev_er_status = er_status;
er_status = ER_PB_ORDERED_REFIX_FAILED;
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, er_status, 3, ordered_holders_info[i].vpid.volid,
ordered_holders_info[i].vpid.pageid, prev_er_status);
}
goto exit;
}
/* get holder of last fix: the last fixed page is at the top of the holder list; we parse the list just for safety */
for (holder = pgbuf_Pool.thrd_holder_info[thrd_idx].thrd_hold_list; holder != NULL; holder = holder->thrd_link)
{
if (VPID_EQ (&(holder->bufptr->vpid), &(ordered_holders_info[i].vpid)))
{
break;
}
}
assert (holder != NULL);
if (VPID_EQ (req_vpid, &(ordered_holders_info[i].vpid)))
{
ret_pgptr = pgptr;
if (has_dealloc_prevent_flag == true)
{
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
pgbuf_bcb_unregister_avoid_deallocation (bufptr);
has_dealloc_prevent_flag = false;
}
if (req_watcher != NULL)
{
#if !defined(NDEBUG)
pgbuf_add_watch_instance_internal (holder, pgptr, req_watcher, request_mode, true, caller_file,
caller_line);
#else
pgbuf_add_watch_instance_internal (holder, pgptr, req_watcher, request_mode, true);
#endif
req_page_has_watcher = true;
#if defined(PGBUF_ORDERED_DEBUG)
_er_log_debug (__FILE__, __LINE__,
"ordered_fix(%u) : fixed req page, VPID:(%d,%d), GROUP:%d,%d, "
"rank:%d, pgptr:%X, holder_fix_count:%d, holder_watch_count:%d, holder_fixed_at:%s, ",
ordered_fix_id, ordered_holders_info[i].vpid.volid, ordered_holders_info[i].vpid.pageid,
ordered_holders_info[i].group_id.volid, ordered_holders_info[i].group_id.pageid,
ordered_holders_info[i].rank, pgptr, holder->fix_count, holder->watch_count,
holder->fixed_at);
#endif
}
}
else
{
int j;
/* page is fixed, therefore avoiding deallocate is no longer necessary */
assert (ordered_holders_info[i].prevent_dealloc);
ordered_holders_info[i].prevent_dealloc = false;
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
pgbuf_bcb_unregister_avoid_deallocation (bufptr);
#if !defined (NDEBUG)
/* page after re-fix should have the same type as before unfix */
(void) pgbuf_check_page_ptype (thread_p, pgptr, ordered_holders_info[i].ptype);
#endif /* !NDEBUG */
#if defined(PGBUF_ORDERED_DEBUG)
_er_log_debug (__FILE__, __LINE__,
"ordered_fix(%u) : restore_holder:%X, VPID:(%d,%d), pgptr:%X, holder_fix_count:%d, "
"holder_watch_count:%d, holder_fixed_at:%s, saved_fix_cnt:%d, saved_watch_cnt:%d",
ordered_fix_id, holder, ordered_holders_info[i].vpid.volid,
ordered_holders_info[i].vpid.pageid, pgptr, holder->fix_count, holder->watch_count,
holder->fixed_at, ordered_holders_info[i].fix_cnt, ordered_holders_info[i].watch_count);
#endif
/* restore number of fixes for previously fixed page: just use pgbuf_fix since it is safer */
for (j = 1; j < ordered_holders_info[i].watch_count; j++)
{
#if !defined(NDEBUG)
pgptr =
pgbuf_fix_debug (thread_p, &(ordered_holders_info[i].vpid), curr_fetch_mode, curr_request_mode,
PGBUF_UNCONDITIONAL_LATCH, caller_file, caller_line, caller_func);
#else
pgptr =
pgbuf_fix_release (thread_p, &(ordered_holders_info[i].vpid), curr_fetch_mode, curr_request_mode,
PGBUF_UNCONDITIONAL_LATCH);
#endif
if (pgptr == NULL)
{
assert_release (false);
er_status = ER_FAILED_ASSERTION;
goto exit;
}
}
for (j = 0; j < ordered_holders_info[i].watch_count; j++)
{
#if !defined(NDEBUG)
pgbuf_add_watch_instance_internal (holder, pgptr, ordered_holders_info[i].watcher[j],
(PGBUF_LATCH_MODE) ordered_holders_info[i].watcher[j]->latch_mode,
false, caller_file, caller_line);
#else
pgbuf_add_watch_instance_internal (holder, pgptr, ordered_holders_info[i].watcher[j],
(PGBUF_LATCH_MODE) ordered_holders_info[i].watcher[j]->latch_mode,
false);
#endif
#if defined(PGBUF_ORDERED_DEBUG)
_er_log_debug (__FILE__, __LINE__,
"ordered_fix(%u) : restore_watcher:%X, GROUP:%d,%d, rank:%d/%d,"
" pgptr:%X, holder_fix_count:%d, holder_watch_count:%d, holder_fixed_at:%s",
ordered_fix_id, ordered_holders_info[i].watcher[j],
ordered_holders_info[i].watcher[j]->group_id.volid,
ordered_holders_info[i].watcher[j]->group_id.pageid,
ordered_holders_info[i].watcher[j]->curr_rank,
ordered_holders_info[i].watcher[j]->initial_rank,
ordered_holders_info[i].watcher[j]->pgptr, holder->fix_count, holder->watch_count,
holder->fixed_at);
#endif /* PGBUF_ORDERED_DEBUG */
}
}
}
exit:
if (er_status_get_hfid != NO_ERROR && er_status == NO_ERROR)
{
er_status = er_status_get_hfid;
}
assert (er_status != NO_ERROR || !VPID_ISNULL (&(req_watcher->group_id)));
if (ret_pgptr != NULL && er_status != NO_ERROR)
{
if (req_page_has_watcher)
{
pgbuf_ordered_unfix_and_init (thread_p, ret_pgptr, req_watcher);
}
else
{
pgbuf_unfix_and_init (thread_p, ret_pgptr);
}
}
if (req_page_has_group == false && ret_pgptr != NULL && req_watcher->curr_rank != PGBUF_ORDERED_HEAP_HDR
&& VPID_EQ (&req_watcher->group_id, req_vpid))
{
req_watcher->curr_rank = PGBUF_ORDERED_HEAP_HDR;
}
for (i = 0; i < saved_pages_cnt; i++)
{
if (ordered_holders_info[i].prevent_dealloc)
{
/* we need to remove the avoid-deallocation flag. */
PGBUF_BUFFER_HASH *hash_anchor = &pgbuf_Pool.buf_hash_table[PGBUF_HASH_VALUE (&ordered_holders_info[i].vpid)];
bufptr = pgbuf_search_hash_chain (thread_p, hash_anchor, &ordered_holders_info[i].vpid);
if (bufptr == NULL)
{
/* oops... no longer in buffer?? */
assert (false);
pthread_mutex_unlock (&hash_anchor->hash_mutex);
continue;
}
if (!pgbuf_bcb_should_avoid_deallocation (bufptr))
{
/* oops... deallocate not prevented */
assert (false);
}
else
{
pgbuf_bcb_unregister_avoid_deallocation (bufptr);
}
PGBUF_BCB_UNLOCK (bufptr);
}
}
return er_status;
}
/*
* pgbuf_get_groupid_and_unfix () - retrieves group identifier of page and performs unlatch if requested.
* return: error code
* req_vpid(in): id of page for which the group is needed (for debug)
* pgptr(in): page (already latched); only heap page allowed
* groupid(out): group identifier (VPID of HFID)
* do_unfix(in): if true, it unfixes the page.
*
* Note : helper function of ordered fix.
*/
static int
pgbuf_get_groupid_and_unfix (THREAD_ENTRY * thread_p, const VPID * req_vpid, PAGE_PTR * pgptr, VPID * groupid,
bool do_unfix)
{
OID cls_oid;
HFID hfid;
int er_status = NO_ERROR;
assert (pgptr != NULL && *pgptr != NULL);
assert (groupid != NULL);
VPID_SET_NULL (groupid);
/* get class oid and hfid */
er_status = heap_get_class_oid_from_page (thread_p, *pgptr, &cls_oid);
if (do_unfix == true)
{
/* release requested page to avoid deadlocks with catalog pages */
pgbuf_unfix_and_init (thread_p, *pgptr);
}
if (er_status != NO_ERROR)
{
return er_status;
}
assert (do_unfix == false || *pgptr == NULL);
if (OID_IS_ROOTOID (&cls_oid))
{
boot_find_root_heap (&hfid);
}
else
{
er_status = heap_get_class_info (thread_p, &cls_oid, &hfid, NULL, NULL);
}
if (er_status == NO_ERROR)
{
if (HFID_IS_NULL (&hfid))
{
/* the requested page does not belong to a heap */
er_status = ER_PB_ORDERED_NO_HEAP;
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, er_status, 2, req_vpid->volid, req_vpid->pageid);
}
else
{
groupid->volid = hfid.vfid.volid;
groupid->pageid = hfid.hpgid;
assert (!VPID_ISNULL (groupid));
}
}
return er_status;
}
/*
* pgbuf_ordered_unfix () - Unfix a page which was previously fixed with ordered_fix (has a page watcher)
* return: void
* thread_p(in):
* watcher_object(in/out): page watcher
*
*/
#if !defined (NDEBUG)
void
pgbuf_ordered_unfix_debug (THREAD_ENTRY * thread_p, PGBUF_WATCHER * watcher_object, const char *caller_file,
int caller_line, const char *caller_func)
#else /* NDEBUG */
void
pgbuf_ordered_unfix (THREAD_ENTRY * thread_p, PGBUF_WATCHER * watcher_object)
#endif /* NDEBUG */
{
PGBUF_HOLDER *holder;
PAGE_PTR pgptr;
PGBUF_WATCHER *watcher;
assert (watcher_object != NULL);
#if !defined(NDEBUG)
assert (watcher_object->magic == PGBUF_WATCHER_MAGIC_NUMBER);
#endif
if (watcher_object->pgptr == NULL)
{
assert_release (false);
return;
}
pgptr = watcher_object->pgptr;
assert (pgptr != NULL);
holder = pgbuf_get_holder (thread_p, pgptr);
assert_release (holder != NULL);
watcher = holder->last_watcher;
while (watcher != NULL)
{
if (watcher == watcher_object)
{
/* found */
break;
}
watcher = watcher->prev;
}
assert_release (watcher != NULL);
assert (holder->fix_count >= holder->watch_count);
pgbuf_remove_watcher (holder, watcher_object);
#if !defined(NDEBUG)
pgbuf_watcher_init_debug (watcher_object, caller_file, caller_line, false);
pgbuf_unfix_debug (thread_p, pgptr, caller_file, caller_line, caller_func);
#else
pgbuf_unfix (thread_p, pgptr);
#endif
}
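/* Illustrative sketch (not part of the original file): the typical pairing of
 * pgbuf_ordered_fix and pgbuf_ordered_unfix through a watcher. The exact
 * pgbuf_ordered_fix signature is assumed from the parameter names used above
 * (req_vpid, fetch_mode, request_mode, req_watcher); see the public header
 * for the authoritative declaration. Kept under #if 0 so it is never built. */
#if 0
static int
example_ordered_fix_usage (THREAD_ENTRY * thread_p, const VPID * vpid, HFID * hfid)
{
  PGBUF_WATCHER watcher;
  int error;

  /* initialize a clean watcher with a heap rank and the heap (HFID) as group */
  PGBUF_INIT_WATCHER (&watcher, PGBUF_ORDERED_HEAP_NORMAL, hfid);

  error = pgbuf_ordered_fix (thread_p, vpid, OLD_PAGE, PGBUF_LATCH_READ, &watcher);
  if (error != NO_ERROR)
    {
      return error;		/* page was not fixed; watcher holds no page */
    }

  /* ... read from watcher.pgptr; the page may have been unfixed and refixed
   * internally, so check watcher.page_was_unfixed before reusing cached
   * pointers into the page ... */

  pgbuf_ordered_unfix (thread_p, &watcher);
  return NO_ERROR;
}
#endif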
/*
* pgbuf_add_watch_instance_internal () - Adds a page watcher for a fixed page
* holder(in): holder object
* pgptr(in): page pointer
* watcher(in/out): page watcher
* latch_mode(in): latch mode used for fixing the page
* clear_unfix_flag(in): True to reset page_was_unfixed flag, false otherwise.
*
*/
#if !defined(NDEBUG)
STATIC_INLINE void
pgbuf_add_watch_instance_internal (PGBUF_HOLDER * holder, PAGE_PTR pgptr, PGBUF_WATCHER * watcher,
const PGBUF_LATCH_MODE latch_mode, const bool clear_unfix_flag,
const char *caller_file, const int caller_line)
#else
STATIC_INLINE void
pgbuf_add_watch_instance_internal (PGBUF_HOLDER * holder, PAGE_PTR pgptr, PGBUF_WATCHER * watcher,
const PGBUF_LATCH_MODE latch_mode, const bool clear_unfix_flag)
#endif
{
#if !defined(NDEBUG)
char *p;
#endif
assert (watcher != NULL);
assert (pgptr != NULL);
assert (holder != NULL);
assert (holder->watch_count < PGBUF_MAX_PAGE_WATCHERS);
assert (watcher->pgptr == NULL);
assert (watcher->next == NULL);
assert (watcher->prev == NULL);
if (holder->last_watcher == NULL)
{
assert (holder->first_watcher == NULL);
holder->first_watcher = watcher;
holder->last_watcher = watcher;
}
else
{
watcher->prev = holder->last_watcher;
(holder->last_watcher)->next = watcher;
holder->last_watcher = watcher;
}
watcher->pgptr = pgptr;
watcher->latch_mode = latch_mode;
if (clear_unfix_flag)
{
watcher->page_was_unfixed = false;
}
holder->watch_count += 1;
#if !defined(NDEBUG)
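/* scan backward from the end of caller_file to its last path separator so that watched_at records only the base file
 * name */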
p = (char *) caller_file + strlen (caller_file);
while (p)
{
if (p == caller_file)
{
break;
}
if (*p == '/' || *p == '\\')
{
p++;
break;
}
p--;
}
snprintf (watcher->watched_at, sizeof (watcher->watched_at) - 1, "%s:%d", p, caller_line);
#endif
}
/*
* pgbuf_attach_watcher () - Add a watcher to a fixed page.
*
* return : Void.
* thread_p (in) : Thread entry.
* pgptr (in) : Fixed page pointer.
* latch_mode (in) : Latch mode.
* hfid (in) : Heap file identifier.
* watcher (out) : Page watcher.
*/
#if !defined (NDEBUG)
void
pgbuf_attach_watcher_debug (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, PGBUF_LATCH_MODE latch_mode, HFID * hfid,
PGBUF_WATCHER * watcher, const char *caller_file, const int caller_line)
#else /* NDEBUG */
void
pgbuf_attach_watcher (THREAD_ENTRY * thread_p, PAGE_PTR pgptr, PGBUF_LATCH_MODE latch_mode, HFID * hfid,
PGBUF_WATCHER * watcher)
#endif /* NDEBUG */
{
PGBUF_HOLDER *holder = NULL;
VPID header_vpid = VPID_INITIALIZER;
PGBUF_ORDERED_RANK rank;
assert (pgptr != NULL);
assert (watcher != NULL);
assert (hfid != NULL && !HFID_IS_NULL (hfid));
header_vpid.volid = hfid->vfid.volid;
header_vpid.pageid = hfid->hpgid;
/* Set current rank based on page being heap header or not. */
if (VPID_EQ (&header_vpid, pgbuf_get_vpid_ptr (pgptr)))
{
rank = PGBUF_ORDERED_HEAP_HDR;
}
else
{
rank = PGBUF_ORDERED_HEAP_NORMAL;
}
PGBUF_INIT_WATCHER (watcher, rank, hfid);
watcher->curr_rank = rank;
holder = pgbuf_get_holder (thread_p, pgptr);
assert (holder != NULL);
#if !defined (NDEBUG)
pgbuf_add_watch_instance_internal (holder, pgptr, watcher, latch_mode, true, caller_file, caller_line);
#else
pgbuf_add_watch_instance_internal (holder, pgptr, watcher, latch_mode, true);
#endif
}
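/* Illustrative sketch (not part of the original file): attaching a watcher to
 * a page already fixed with plain pgbuf_fix, so that it can later be released
 * through the ordered-fix protocol. Error handling is elided and the snippet
 * is kept under #if 0 so it is never built. */
#if 0
static void
example_attach_watcher (THREAD_ENTRY * thread_p, const VPID * vpid, HFID * hfid)
{
  PGBUF_WATCHER watcher;
  PAGE_PTR pgptr;

  pgptr = pgbuf_fix (thread_p, vpid, OLD_PAGE, PGBUF_LATCH_WRITE, PGBUF_UNCONDITIONAL_LATCH);
  if (pgptr == NULL)
    {
      return;
    }

  /* the watcher rank (heap header vs. normal page) is derived from hfid inside pgbuf_attach_watcher */
  pgbuf_attach_watcher (thread_p, pgptr, PGBUF_LATCH_WRITE, hfid, &watcher);

  /* ... modify the page through watcher.pgptr ... */

  pgbuf_ordered_set_dirty_and_free (thread_p, &watcher);
}
#endif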
/*
* pgbuf_get_holder () - Searches holder of fixed page
* Return : holder object or NULL if not found
* thread_p(in):
* pgptr(in): pgptr
*/
static PGBUF_HOLDER *
pgbuf_get_holder (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
{
int thrd_idx;
PGBUF_BCB *bufptr;
PGBUF_HOLDER *holder;
assert (pgptr != NULL);
if (thread_p == NULL)
{
thread_p = thread_get_thread_entry_info ();
assert (thread_p != NULL);
}
if (!thread_p->m_holder_anchor)
{
thread_p->m_holder_anchor = &pgbuf_Pool.thrd_holder_info[thread_p->index];
}
holder = thread_p->m_holder_anchor->thrd_hold_list;
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
for (; holder != NULL; holder = holder->thrd_link)
{
if (bufptr == holder->bufptr)
{
return holder;
}
}
return NULL;
}
/*
* pgbuf_remove_watcher () - Removes a page watcher
* holder(in): holder object
* watcher_object(in): watcher object
*/
static void
pgbuf_remove_watcher (PGBUF_HOLDER * holder, PGBUF_WATCHER * watcher_object)
{
PAGE_PTR pgptr;
assert (watcher_object != NULL);
assert (holder != NULL);
#if !defined(NDEBUG)
assert (watcher_object->magic == PGBUF_WATCHER_MAGIC_NUMBER);
#endif
pgptr = watcher_object->pgptr;
if (holder->first_watcher == watcher_object)
{
assert (watcher_object->prev == NULL);
holder->first_watcher = watcher_object->next;
}
else if (watcher_object->prev != NULL)
{
(watcher_object->prev)->next = watcher_object->next;
}
if (holder->last_watcher == watcher_object)
{
assert (watcher_object->next == NULL);
holder->last_watcher = watcher_object->prev;
}
else if (watcher_object->next != NULL)
{
(watcher_object->next)->prev = watcher_object->prev;
}
watcher_object->next = NULL;
watcher_object->prev = NULL;
watcher_object->pgptr = NULL;
watcher_object->curr_rank = PGBUF_ORDERED_RANK_UNDEFINED;
holder->watch_count -= 1;
}
/*
* pgbuf_replace_watcher () - Replaces a page watcher with another page watcher
* thread_p(in):
* old_watcher(in/out): current page watcher to replace
* new_watcher(in/out): new page watcher to use
*
*/
#if !defined(NDEBUG)
void
pgbuf_replace_watcher_debug (THREAD_ENTRY * thread_p, PGBUF_WATCHER * old_watcher, PGBUF_WATCHER * new_watcher,
const char *caller_file, const int caller_line)
#else
void
pgbuf_replace_watcher (THREAD_ENTRY * thread_p, PGBUF_WATCHER * old_watcher, PGBUF_WATCHER * new_watcher)
#endif
{
PGBUF_HOLDER *holder;
PAGE_PTR page_ptr;
PGBUF_LATCH_MODE latch_mode;
assert (old_watcher != NULL);
assert (PGBUF_IS_CLEAN_WATCHER (new_watcher));
#if !defined(NDEBUG)
assert (old_watcher->magic == PGBUF_WATCHER_MAGIC_NUMBER);
assert (new_watcher->magic == PGBUF_WATCHER_MAGIC_NUMBER);
#endif
assert (old_watcher->pgptr != NULL);
holder = pgbuf_get_holder (thread_p, old_watcher->pgptr);
assert_release (holder != NULL);
page_ptr = old_watcher->pgptr;
latch_mode = (PGBUF_LATCH_MODE) old_watcher->latch_mode;
new_watcher->initial_rank = old_watcher->initial_rank;
new_watcher->curr_rank = old_watcher->curr_rank;
VPID_COPY (&new_watcher->group_id, &old_watcher->group_id);
pgbuf_remove_watcher (holder, old_watcher);
#if !defined(NDEBUG)
pgbuf_watcher_init_debug (old_watcher, caller_file, caller_line, false);
pgbuf_add_watch_instance_internal (holder, page_ptr, new_watcher, latch_mode, true, caller_file, caller_line);
#else
pgbuf_add_watch_instance_internal (holder, page_ptr, new_watcher, latch_mode, true);
#endif
}
/*
* pgbuf_ordered_set_dirty_and_free () - Marks the associated buffer as dirty and unfixes the page
* (previously fixed with ordered fix)
* return: void
* thread_p(in):
* pg_watcher(in): page watcher holding the page to dirty and unfix
*/
void
pgbuf_ordered_set_dirty_and_free (THREAD_ENTRY * thread_p, PGBUF_WATCHER * pg_watcher)
{
pgbuf_set_dirty (thread_p, pg_watcher->pgptr, DONT_FREE);
pgbuf_ordered_unfix (thread_p, pg_watcher);
}
/*
* pgbuf_get_condition_for_ordered_fix () - returns the condition which should
* be used to latch (vpid_new_page) knowing that we already have a latch on
* (vpid_fixed_page)
*
* return: latch condition (PGBUF_LATCH_CONDITION)
* vpid_new_page(in):
* vpid_fixed_page(in):
* hfid(in): HFID of both pages
*
* Note: This is intended only for HEAP/HEAP_OVERFLOW pages.
* The user should make sure both pages belong to the same heap.
* To be used when pgbuf_ordered_fix is not possible:
* In vacuum context, unfixing an older page to prevent a deadlatch
* requires flushing the old page first - this is not possible with
* pgbuf_ordered_fix.
*/
int
pgbuf_get_condition_for_ordered_fix (const VPID * vpid_new_page, const VPID * vpid_fixed_page, const HFID * hfid)
{
PGBUF_HOLDER_INFO new_page_holder_info;
PGBUF_HOLDER_INFO fixed_page_holder_info;
new_page_holder_info.group_id.volid = hfid->vfid.volid;
new_page_holder_info.group_id.pageid = hfid->hpgid;
fixed_page_holder_info.group_id.volid = hfid->vfid.volid;
fixed_page_holder_info.group_id.pageid = hfid->hpgid;
VPID_COPY (&new_page_holder_info.vpid, vpid_new_page);
VPID_COPY (&fixed_page_holder_info.vpid, vpid_fixed_page);
if (VPID_EQ (&new_page_holder_info.group_id, &new_page_holder_info.vpid))
{
new_page_holder_info.rank = PGBUF_ORDERED_HEAP_HDR;
}
else
{
new_page_holder_info.rank = PGBUF_ORDERED_HEAP_NORMAL;
}
if (VPID_EQ (&fixed_page_holder_info.group_id, &fixed_page_holder_info.vpid))
{
fixed_page_holder_info.rank = PGBUF_ORDERED_HEAP_HDR;
}
else
{
fixed_page_holder_info.rank = PGBUF_ORDERED_HEAP_NORMAL;
}
if (pgbuf_compare_hold_vpid_for_sort (&new_page_holder_info, &fixed_page_holder_info) < 0)
{
return PGBUF_CONDITIONAL_LATCH;
}
return PGBUF_UNCONDITIONAL_LATCH;
}
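/* Illustrative sketch (not part of the original file): feeding the returned
 * condition into pgbuf_fix. When the new page sorts before the one already
 * latched, only a conditional latch is safe (a NULL result then signals the
 * caller to back off and retry); otherwise the natural fix order is respected
 * and an unconditional wait is fine. Kept under #if 0 so it is never built. */
#if 0
static PAGE_PTR
example_fix_second_heap_page (THREAD_ENTRY * thread_p, const VPID * vpid_new,
			      const VPID * vpid_fixed, const HFID * hfid)
{
  int condition = pgbuf_get_condition_for_ordered_fix (vpid_new, vpid_fixed, hfid);

  return pgbuf_fix (thread_p, vpid_new, OLD_PAGE, PGBUF_LATCH_WRITE, (PGBUF_LATCH_CONDITION) condition);
}
#endif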
#if !defined(NDEBUG)
/*
* pgbuf_watcher_init_debug () -
* return: void
* watcher(in/out):
* add(in): if add or reset the "init" field
*/
void
pgbuf_watcher_init_debug (PGBUF_WATCHER * watcher, const char *caller_file, const int caller_line, bool add)
{
char *p;
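/* scan backward from the end of caller_file to its last path separator so that init_at records only the base file
 * name */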
p = (char *) caller_file + strlen (caller_file);
while (p)
{
if (p == caller_file)
{
break;
}
if (*p == '/' || *p == '\\')
{
p++;
break;
}
p--;
}
if (add)
{
char prev_init[256];
strncpy (prev_init, watcher->init_at, sizeof (watcher->init_at) - 1);
prev_init[sizeof (prev_init) - 1] = '\0';
snprintf_dots_truncate (watcher->init_at, sizeof (watcher->init_at) - 1, "%s:%d %s", p, caller_line, prev_init);
}
else
{
snprintf (watcher->init_at, sizeof (watcher->init_at) - 1, "%s:%d", p, caller_line);
}
}
/*
* pgbuf_is_page_fixed_by_thread () -
* return: true if page is already fixed, false otherwise
* thread_p(in): thread entry
* vpid_p(in): virtual page id
*/
bool
pgbuf_is_page_fixed_by_thread (THREAD_ENTRY * thread_p, const VPID * vpid_p)
{
int thrd_index;
PGBUF_HOLDER_ANCHOR *thrd_holder_info;
PGBUF_HOLDER *thrd_holder;
assert (vpid_p != NULL);
/* walk holders and try to find page */
thrd_index = thread_get_entry_index (thread_p);
thrd_holder_info = &(pgbuf_Pool.thrd_holder_info[thrd_index]);
for (thrd_holder = thrd_holder_info->thrd_hold_list; thrd_holder != NULL; thrd_holder = thrd_holder->next_holder)
{
if (VPID_EQ (&thrd_holder->bufptr->vpid, vpid_p))
{
return true;
}
}
return false;
}
#endif
/*
* pgbuf_initialize_page_quota_parameters () - Initializes page quota parameters
*
* return: NO_ERROR, or ER_code
*
* Note: Call this before any LRU initialization
*/
static int
pgbuf_initialize_page_quota_parameters (void)
{
PGBUF_PAGE_QUOTA *quota;
quota = &(pgbuf_Pool.quota);
memset (quota, 0, sizeof (PGBUF_PAGE_QUOTA));
tsc_getticks ("a->last_adjust_time);
quota->adjust_age = 0;
quota->is_adjusting = 0;
#if defined (SERVER_MODE)
quota->num_private_LRU_list = prm_get_integer_value (PRM_ID_PB_NUM_PRIVATE_CHAINS);
if (quota->num_private_LRU_list == -1)
{
/* set value automatically to maximum number of workers (active and vacuum). */
quota->num_private_LRU_list = MAX_NTRANS + VACUUM_MAX_WORKER_COUNT;
}
else if (quota->num_private_LRU_list == 0)
{
/* disabled */
}
else
{
/* set number of workers to the number desired by user (or to minimum accepted) */
if (quota->num_private_LRU_list < PGBUF_PRIVATE_LRU_MIN_COUNT)
{
/* set to minimum count */
quota->num_private_LRU_list = PGBUF_PRIVATE_LRU_MIN_COUNT;
}
}
#else /* !SERVER_MODE */ /* SA_MODE */
/* stand-alone quota is disabled */
quota->num_private_LRU_list = 0;
#endif /* SA_MODE */
return NO_ERROR;
}
/*
* pgbuf_initialize_page_quota () - Initializes page quota
* return: NO_ERROR, or ER_code
*/
static int
pgbuf_initialize_page_quota (void)
{
PGBUF_PAGE_QUOTA *quota;
int i;
int error_status = NO_ERROR;
quota = &(pgbuf_Pool.quota);
quota->lru_victim_flush_priority_per_lru =
(float *) malloc (PGBUF_TOTAL_LRU_COUNT * sizeof (quota->lru_victim_flush_priority_per_lru[0]));
if (quota->lru_victim_flush_priority_per_lru == NULL)
{
error_status = ER_OUT_OF_VIRTUAL_MEMORY;
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY,
1, (PGBUF_TOTAL_LRU_COUNT * sizeof (quota->lru_victim_flush_priority_per_lru[0])));
goto exit;
}
quota->private_lru_session_cnt =
(int *) malloc (PGBUF_PRIVATE_LRU_COUNT * sizeof (quota->private_lru_session_cnt[0]));
if (quota->private_lru_session_cnt == NULL)
{
error_status = ER_OUT_OF_VIRTUAL_MEMORY;
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY,
1, (PGBUF_PRIVATE_LRU_COUNT * sizeof (quota->private_lru_session_cnt[0])));
goto exit;
}
/* initialize the quota data for each LRU */
for (i = 0; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
quota->lru_victim_flush_priority_per_lru[i] = 0;
if (PGBUF_IS_PRIVATE_LRU_INDEX (i))
{
quota->private_lru_session_cnt[PGBUF_PRIVATE_LIST_FROM_LRU_INDEX (i)] = 0;
}
}
if (PGBUF_PAGE_QUOTA_IS_ENABLED)
{
quota->private_pages_ratio = 1.0f;
}
else
{
quota->private_pages_ratio = 0;
}
quota->add_shared_lru_idx = 0;
quota->avoid_shared_lru_idx = -1;
exit:
return error_status;
}
/*
* pgbuf_initialize_page_monitor () - Initializes page monitor
* return: NO_ERROR, or ER_code
*/
static int
pgbuf_initialize_page_monitor (void)
{
PGBUF_PAGE_MONITOR *monitor;
int i;
int error_status = NO_ERROR;
#if defined (SERVER_MODE)
size_t count_threads = thread_num_total_threads ();
#endif /* SERVER_MODE */
monitor = &(pgbuf_Pool.monitor);
monitor->dirties_cnt = 0;
monitor->lru_hits = NULL;
monitor->lru_activity = NULL;
#if defined (SERVER_MODE)
monitor->bcb_locks = NULL;
#endif /* SERVER_MODE */
monitor->victim_rich = false;
monitor->lru_hits = (int *) malloc (PGBUF_TOTAL_LRU_COUNT * sizeof (monitor->lru_hits[0]));
if (monitor->lru_hits == NULL)
{
error_status = ER_OUT_OF_VIRTUAL_MEMORY;
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY,
1, (PGBUF_TOTAL_LRU_COUNT * sizeof (monitor->lru_hits[0])));
goto exit;
}
monitor->lru_activity = (int *) malloc (PGBUF_TOTAL_LRU_COUNT * sizeof (monitor->lru_activity[0]));
if (monitor->lru_activity == NULL)
{
error_status = ER_OUT_OF_VIRTUAL_MEMORY;
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY,
1, (PGBUF_TOTAL_LRU_COUNT * sizeof (monitor->lru_activity[0])));
goto exit;
}
/* initialize the monitor data for each LRU */
for (i = 0; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
monitor->lru_hits[i] = 0;
monitor->lru_activity[i] = 0;
}
monitor->lru_victim_req_cnt = 0;
monitor->fix_req_cnt.store (0);
monitor->pg_unfix_cnt.store (0);
monitor->lru_shared_pgs_cnt = 0;
#if defined (SERVER_MODE)
if (pgbuf_Monitor_locks)
{
monitor->bcb_locks = (PGBUF_MONITOR_BCB_MUTEX *) calloc (count_threads, sizeof (PGBUF_MONITOR_BCB_MUTEX));
if (monitor->bcb_locks == NULL)
{
error_status = ER_OUT_OF_VIRTUAL_MEMORY;
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1,
count_threads * sizeof (PGBUF_MONITOR_BCB_MUTEX));
goto exit;
}
}
#endif /* SERVER_MODE */
/* no bcb's, no victims */
monitor->victim_rich = false;
exit:
return error_status;
}
/*
* pgbuf_compute_lru_vict_target () -
*
* lru_sum_flush_priority(out) : sum of all flush priorities of all LRUs
* return : void
*/
static void
pgbuf_compute_lru_vict_target (float *lru_sum_flush_priority)
{
int i;
float prv_quota;
float prv_real_ratio;
float diff;
float prv_flush_ratio;
float shared_flush_ratio;
bool use_prv_size = false;
int total_prv_target = 0;
int this_prv_target = 0;
PGBUF_LRU_LIST *lru_list;
assert (lru_sum_flush_priority != NULL);
*lru_sum_flush_priority = 0;
prv_quota = pgbuf_Pool.quota.private_pages_ratio;
assert (pgbuf_Pool.monitor.lru_shared_pgs_cnt >= 0
&& pgbuf_Pool.monitor.lru_shared_pgs_cnt <= pgbuf_Pool.num_buffers);
prv_real_ratio = 1.0f - ((float) pgbuf_Pool.monitor.lru_shared_pgs_cnt / pgbuf_Pool.num_buffers);
diff = prv_quota - prv_real_ratio;
prv_flush_ratio = prv_real_ratio * (1.0f - diff);
prv_flush_ratio = MIN (1.0f, prv_flush_ratio);
for (i = PGBUF_LRU_INDEX_FROM_PRIVATE (0); i < PGBUF_TOTAL_LRU_COUNT; i++)
{
lru_list = PGBUF_GET_LRU_LIST (i);
/* note: we especially target over-quota private lists, or lists close to quota. we cannot target only over-quota
 * lists (I tried), because you may find yourself in the peculiar case where quotas are on par with list size, while
 * shared lists are right below minimum desired size... and flush will not find anything.
 */
this_prv_target = PGBUF_LRU_LIST_COUNT (lru_list) - (int) (lru_list->quota * 0.9);
this_prv_target = MIN (this_prv_target, lru_list->count_lru3);
if (this_prv_target > 0)
{
total_prv_target += this_prv_target;
}
}
if (total_prv_target == 0)
{
/* can we victimize from shared? */
if (pgbuf_Pool.monitor.lru_shared_pgs_cnt
<= (int) (pgbuf_Pool.num_LRU_list * PGBUF_MIN_SHARED_LIST_ADJUST_SIZE
* (pgbuf_Pool.ratio_lru1 + pgbuf_Pool.ratio_lru2)))
{
/* we won't be able to victimize from shared. this is a backup hack, I don't like to rely on it. let's
* find smarter ways to avoid the case. */
/* right now, considering we target all bcb's beyond 90% of quota, but total_prv_target is 0, that means all
* private bcb's must be less than 90% of buffer. that means shared bcb's have to be 10% or more of buffer.
* PGBUF_MIN_SHARED_LIST_ADJUST_SIZE is currently set to 50, which is 5% to targeted 1k shared list size.
* we shouldn't be here unless I messed up the calculation. */
if (pgbuf_Pool.buf_invalid_list.invalid_cnt > 0)
{
/* This is not really an interesting case.
* Probably both shared and private are small and most of buffers in invalid list.
* We don't really need flush for the case, since BCB could be allocated from invalid list.
*/
return;
}
assert (false);
use_prv_size = true;
prv_flush_ratio = 1.0f;
/* we can compute the zone 3 total size (for privates, zones 1 & 2 are both set to minimum ratio). */
total_prv_target =
(int) ((pgbuf_Pool.num_buffers - pgbuf_Pool.monitor.lru_shared_pgs_cnt)
* (1.0f - 2 * PGBUF_LRU_ZONE_MIN_RATIO));
}
}
shared_flush_ratio = 1.0f - prv_flush_ratio;
for (i = 0; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
lru_list = PGBUF_GET_LRU_LIST (i);
if (PGBUF_IS_SHARED_LRU_INDEX (i))
{
pgbuf_Pool.quota.lru_victim_flush_priority_per_lru[i] = shared_flush_ratio / (float) PGBUF_SHARED_LRU_COUNT;
}
else if (PGBUF_IS_PRIVATE_LRU_INDEX (i))
{
if (prv_flush_ratio == 0.0f)
{
pgbuf_Pool.quota.lru_victim_flush_priority_per_lru[i] = 0.0f;
}
else
{
if (use_prv_size)
{
/* backup plan: use zone 3 size instead of computed target based on quota. */
this_prv_target = lru_list->count_lru3;
}
else
{
/* use bcb's over 90% of quota as flush target */
this_prv_target = PGBUF_LRU_LIST_COUNT (lru_list) - (int) (lru_list->quota * 0.9);
this_prv_target = MIN (this_prv_target, lru_list->count_lru3);
}
if (this_prv_target > 0)
{
pgbuf_Pool.quota.lru_victim_flush_priority_per_lru[i] =
prv_flush_ratio * ((float) this_prv_target / (float) total_prv_target);
}
else
{
pgbuf_Pool.quota.lru_victim_flush_priority_per_lru[i] = 0.0f;
}
}
}
else
{
pgbuf_Pool.quota.lru_victim_flush_priority_per_lru[i] = 0.0f;
}
*lru_sum_flush_priority += pgbuf_Pool.quota.lru_victim_flush_priority_per_lru[i];
}
}
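/* Worked example (illustrative, not part of the original file): for a private
 * list holding 1000 bcb's against a quota of 900, the 90% mark is 810, so
 * this_prv_target = 1000 - 810 = 190 (then capped by count_lru3). If the sum
 * over all private lists is total_prv_target = 380 and prv_flush_ratio = 0.4,
 * this list gets 0.4 * (190 / 380) = 0.2 of the total flush priority. */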
/*
* pgbuf_adjust_quotas () - Adjusts the quotas for private LRUs. The quotas are decided based on thread activity on
* private and shared lists. Activity is counted as the number of accessed pages.
* Based on the quotas, the thread also sets zone thresholds for each LRU.
*
* return : void
* thread_p (in) : thread entry
*/
void
pgbuf_adjust_quotas (THREAD_ENTRY * thread_p)
{
#define MAX_PRIVATE_RATIO 0.998f
#define MIN_PRIVATE_RATIO 0.01f
PGBUF_PAGE_QUOTA *quota;
PGBUF_PAGE_MONITOR *monitor;
int i;
int all_private_quota;
int sum_private_lru_activity_total = 0;
TSC_TICKS curr_tick;
INT64 diff_usec;
int lru_hits;
int lru_shared_hits = 0;
int lru_private_hits = 0;
float private_ratio;
int avg_shared_lru_size;
int shared_threshold_lru1;
int shared_threshold_lru2;
int new_quota;
float new_lru_ratio;
const INT64 onesec_usec = 1000000LL;
const INT64 tensec_usec = 10 * onesec_usec;
int total_victims = 0;
bool low_overall_activity = false;
PGBUF_LRU_LIST *lru_list;
if (thread_p == NULL)
{
assert (thread_p != NULL);
thread_p = thread_get_thread_entry_info ();
}
quota = &(pgbuf_Pool.quota);
monitor = &(pgbuf_Pool.monitor);
if (!PGBUF_PAGE_QUOTA_IS_ENABLED || quota->is_adjusting)
{
return;
}
quota->is_adjusting = 1;
tsc_getticks (&curr_tick);
diff_usec = tsc_elapsed_utime (curr_tick, quota->last_adjust_time);
if (diff_usec < 1000LL)
{
/* less than 1 msec. stop */
quota->is_adjusting = 0;
return;
}
/* adjust quotas if:
 * - more than 500 msec have passed since the last adjustment and activity is above the threshold, or
 * - more than 5 min have passed since the last adjustment and activity is above 1% of the threshold.
 * Page buffer activity is measured in number of page unfixes.
 */
if (pgbuf_Pool.monitor.pg_unfix_cnt.load (std::memory_order_seq_cst) < PGBUF_TRAN_THRESHOLD_ACTIVITY
&& diff_usec < 500000LL)
{
quota->is_adjusting = 0;
return;
}
if (monitor->pg_unfix_cnt.exchange (0) < PGBUF_TRAN_THRESHOLD_ACTIVITY / 100)
{
low_overall_activity = true;
}
quota->last_adjust_time = curr_tick;
(void) ATOMIC_INC_32 ("a->adjust_age, 1);
/* process hits since last adjust:
* 1. collect lru_private_hits and lru_shared_hits.
* 2. update each private list activity.
* 3. collect total activity.
*/
for (i = 0; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
/* get hits since last adjust and reset */
lru_hits = ATOMIC_TAS_32 (&monitor->lru_hits[i], 0);
/* compute hits per second */
lru_hits = (int) (onesec_usec * lru_hits / diff_usec);
if (PGBUF_IS_PRIVATE_LRU_INDEX (i))
{
/* adjust private lru activity. for convenience reasons, we consider that previous lru_activity value was same
* for 10 seconds minus the time since last adjustment. if previous adjustment is more than 10 seconds old
* then we set new activity. */
if (diff_usec >= tensec_usec)
{
/* set current activity */
monitor->lru_activity[i] = lru_hits;
}
else
{
/* interpolate old activity with new activity */
monitor->lru_activity[i] =
(int) (((tensec_usec - diff_usec) * monitor->lru_activity[i] + diff_usec * lru_hits) / tensec_usec);
}
/* collect to total activity */
sum_private_lru_activity_total += monitor->lru_activity[i];
/* collect to total private hits */
lru_private_hits += lru_hits;
}
else
{
/* collect to total shared hits */
lru_shared_hits += lru_hits;
}
lru_list = PGBUF_GET_LRU_LIST (i);
total_victims += lru_list->count_vict_cand;
}
/* compute private ratio */
if (low_overall_activity)
{
private_ratio = MIN_PRIVATE_RATIO;
}
else
{
/* avoid division by 0 */
lru_shared_hits = MAX (1, lru_shared_hits);
private_ratio = (float) (lru_private_hits) / (float) (lru_private_hits + lru_shared_hits);
private_ratio = MIN (MAX_PRIVATE_RATIO, private_ratio);
private_ratio = MAX (MIN_PRIVATE_RATIO, private_ratio);
}
if (diff_usec >= tensec_usec)
{
quota->private_pages_ratio = private_ratio;
}
else
{
quota->private_pages_ratio =
((quota->private_pages_ratio * (float) (tensec_usec - diff_usec) + private_ratio * (float) diff_usec)
/ (float) tensec_usec);
}
if (sum_private_lru_activity_total == 0)
{
/* no private activity */
/* well I guess we can just set all quotas to 0. */
all_private_quota = 0;
for (i = PGBUF_SHARED_LRU_COUNT; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
lru_list = PGBUF_GET_LRU_LIST (i);
lru_list->quota = 0;
lru_list->threshold_lru1 = 0;
lru_list->threshold_lru2 = 0;
if (lru_list->count_lru1 + lru_list->count_lru2 > 0)
{
pthread_mutex_lock (&lru_list->mutex);
pgbuf_lru_adjust_zones (thread_p, lru_list, false);
pthread_mutex_unlock (&lru_list->mutex);
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
}
if (lru_list->count_vict_cand > 0 && PGBUF_LRU_LIST_IS_OVER_QUOTA (lru_list))
{
/* make sure this is added to victim list */
if (pgbuf_lfcq_add_lru_with_victims (lru_list)
&& perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
/* added to queue of lru lists having victims. */
}
}
}
}
else
{
/* compute all_private_quota in number of bcb's */
all_private_quota =
(int) ((pgbuf_Pool.num_buffers - pgbuf_Pool.buf_invalid_list.invalid_cnt) * quota->private_pages_ratio);
/* split private bcb's quota's based on activity */
for (i = PGBUF_SHARED_LRU_COUNT; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
if (monitor->lru_activity[i] > 0)
{
new_lru_ratio = (float) monitor->lru_activity[i] / (float) sum_private_lru_activity_total;
}
else
{
new_lru_ratio = 0.0f;
}
new_quota = (int) (new_lru_ratio * all_private_quota);
new_quota = MIN (new_quota, PGBUF_PRIVATE_LRU_MAX_HARD_QUOTA);
new_quota = MIN (new_quota, pgbuf_Pool.num_buffers / 2);
lru_list = PGBUF_GET_LRU_LIST (i);
lru_list->quota = new_quota;
lru_list->threshold_lru1 = (int) (new_quota * PGBUF_LRU_ZONE_MIN_RATIO);
lru_list->threshold_lru2 = (int) (new_quota * PGBUF_LRU_ZONE_MIN_RATIO);
if (PGBUF_LRU_LIST_IS_ONE_TWO_OVER_QUOTA (lru_list))
{
pthread_mutex_lock (&lru_list->mutex);
pgbuf_lru_adjust_zones (thread_p, lru_list, false);
pthread_mutex_unlock (&lru_list->mutex);
PGBUF_BCB_CHECK_MUTEX_LEAKS ();
}
if (lru_list->count_vict_cand > 0 && PGBUF_LRU_LIST_IS_OVER_QUOTA (lru_list))
{
/* make sure this is added to victim list */
if (pgbuf_lfcq_add_lru_with_victims (lru_list)
&& perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
/* added to queue of lru lists having victims. */
}
}
}
}
/* set shared target size */
avg_shared_lru_size = (pgbuf_Pool.num_buffers - all_private_quota) / pgbuf_Pool.num_LRU_list;
avg_shared_lru_size = MAX (avg_shared_lru_size, PGBUF_MIN_SHARED_LIST_ADJUST_SIZE);
shared_threshold_lru1 = (int) (avg_shared_lru_size * pgbuf_Pool.ratio_lru1);
shared_threshold_lru2 = (int) (avg_shared_lru_size * pgbuf_Pool.ratio_lru2);
for (i = 0; i < PGBUF_SHARED_LRU_COUNT; i++)
{
lru_list = PGBUF_GET_LRU_LIST (i);
lru_list->threshold_lru1 = shared_threshold_lru1;
lru_list->threshold_lru2 = shared_threshold_lru2;
if (PGBUF_LRU_ARE_ZONES_ONE_TWO_OVER_THRESHOLD (lru_list))
{
pthread_mutex_lock (&lru_list->mutex);
pgbuf_lru_adjust_zones (thread_p, lru_list, false);
pthread_mutex_unlock (&lru_list->mutex);
}
if (lru_list->count_vict_cand > 0)
{
/* make sure this is added to victim list */
if (pgbuf_lfcq_add_lru_with_victims (lru_list)
&& perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
/* added to queue of lru lists having victims. */
}
}
}
/* is pool victim rich? we consider this true if the victim count is more than 10% of page buffer. I think we could
* lower the bar a little bit */
pgbuf_Pool.monitor.victim_rich = total_victims >= (int) (0.1 * pgbuf_Pool.num_buffers);
quota->is_adjusting = 0;
}
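/* Worked example (illustrative, not part of the original file) for the
 * activity interpolation above: with diff_usec equivalent to 2 seconds, an
 * old lru_activity of 300 hits/sec and fresh lru_hits of 500 hits/sec, the
 * new value is ((10 - 2) * 300 + 2 * 500) / 10 = 340 hits/sec; the fresh
 * sample is weighted by the fraction of the 10-second window that elapsed. */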
/*
* pgbuf_assign_private_lru () -
*
* return: assigned private LRU list index, or -1 if page quota is disabled
*/
int
pgbuf_assign_private_lru (THREAD_ENTRY * thread_p)
{
int i;
int min_activity;
int min_bcbs;
int lru_cand_idx, lru_cand_zero_sessions;
int private_idx;
int cnt_lru;
PGBUF_PAGE_MONITOR *monitor;
PGBUF_PAGE_QUOTA *quota;
int retry_cnt = 0;
if (!PGBUF_PAGE_QUOTA_IS_ENABLED)
{
return -1;
}
monitor = &pgbuf_Pool.monitor;
quota = &pgbuf_Pool.quota;
/* Priority for choosing a private list :
* 1. the list with zero sessions having the least number of pages
* 2. the list having least activity */
retry:
lru_cand_zero_sessions = -1;
lru_cand_idx = -1;
min_bcbs = pgbuf_Pool.num_buffers;
min_activity = PGBUF_TRAN_MAX_ACTIVITY;
for (i = PGBUF_SHARED_LRU_COUNT; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
if (quota->private_lru_session_cnt[PGBUF_PRIVATE_LIST_FROM_LRU_INDEX (i)] == 0)
{
cnt_lru = PGBUF_LRU_LIST_COUNT (PGBUF_GET_LRU_LIST (i));
if (cnt_lru < min_bcbs)
{
min_bcbs = cnt_lru;
lru_cand_zero_sessions = i;
if (min_bcbs <= 0)
{
break;
}
}
}
if (monitor->lru_activity[i] < min_activity)
{
min_activity = monitor->lru_activity[i];
lru_cand_idx = i;
}
}
if (lru_cand_zero_sessions != -1)
{
lru_cand_idx = lru_cand_zero_sessions;
}
assert (lru_cand_idx != -1);
cnt_lru = PGBUF_LRU_LIST_COUNT (PGBUF_GET_LRU_LIST (lru_cand_idx));
private_idx = PGBUF_PRIVATE_LIST_FROM_LRU_INDEX (lru_cand_idx);
if (lru_cand_zero_sessions != -1)
{
if (ATOMIC_INC_32 ("a->private_lru_session_cnt[private_idx], 1) > 1)
{
/* another thread stole this lru, retry */
if (retry_cnt++ < 5)
{
ATOMIC_INC_32 ("a->private_lru_session_cnt[private_idx], -1);
goto retry;
}
}
}
else
{
ATOMIC_INC_32 ("a->private_lru_session_cnt[private_idx], 1);
}
/* TODO: is this necessary? */
pgbuf_adjust_quotas (thread_p);
return private_idx;
}
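/* Illustrative sketch (not part of the original file): the expected lifecycle
 * of a private LRU index, obtained when a session starts and returned when it
 * ends. pgbuf_release_private_lru tolerates the -1 returned when the page
 * quota is disabled. Kept under #if 0 so it is never built. */
#if 0
static void
example_private_lru_lifecycle (THREAD_ENTRY * thread_p)
{
  int private_idx = pgbuf_assign_private_lru (thread_p);

  /* ... session lifetime: pages fixed by this thread feed its private list ... */

  (void) pgbuf_release_private_lru (thread_p, private_idx);
}
#endif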
/*
* pgbuf_release_private_lru () -
* return: NO_ERROR
* private_idx(in): index of the private LRU list to release
*
* Note: Decrements the session count of the private LRU list; when no session remains, the list's activity is reset.
*/
int
pgbuf_release_private_lru (THREAD_ENTRY * thread_p, const int private_idx)
{
if (PGBUF_PAGE_QUOTA_IS_ENABLED && private_idx >= 0 && private_idx < PGBUF_PRIVATE_LRU_COUNT
&& pgbuf_Pool.num_buffers > 0)
{
if (ATOMIC_INC_32 (&pgbuf_Pool.quota.private_lru_session_cnt[private_idx], -1) <= 0)
{
ATOMIC_TAS_32 (&pgbuf_Pool.monitor.lru_activity[PGBUF_LRU_INDEX_FROM_PRIVATE (private_idx)], 0);
/* TODO: is this necessary? */
pgbuf_adjust_quotas (thread_p);
}
}
return NO_ERROR;
}
/*
* pgbuf_initialize_seq_flusher () - Initializes sequential flusher on a list of pages to be flushed
*
* return: error code
* seq_flusher(in/out):
* f_list(in/out): flush list to use or NULL if needs to be allocated
* cnt(in/out): size of flush list
*/
static int
pgbuf_initialize_seq_flusher (PGBUF_SEQ_FLUSHER * seq_flusher, PGBUF_VICTIM_CANDIDATE_LIST * f_list, const int cnt)
{
int alloc_size;
memset (seq_flusher, 0, sizeof (*seq_flusher));
seq_flusher->flush_max_size = cnt;
if (f_list != NULL)
{
seq_flusher->flush_list = f_list;
}
else
{
alloc_size = seq_flusher->flush_max_size * sizeof (seq_flusher->flush_list[0]);
seq_flusher->flush_list = (PGBUF_VICTIM_CANDIDATE_LIST *) malloc (alloc_size);
if (seq_flusher->flush_list == NULL)
{
er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, alloc_size);
return ER_OUT_OF_VIRTUAL_MEMORY;
}
}
seq_flusher->flush_cnt = 0;
seq_flusher->flush_idx = 0;
seq_flusher->burst_mode = true;
seq_flusher->control_intervals_cnt = 0;
seq_flusher->control_flushed = 0;
return NO_ERROR;
}
/*
* pgbuf_has_any_waiters () - Quick check if page has any waiters.
*
* return : True if page has any waiters, false otherwise.
* pgptr (in) : Page pointer.
*/
bool
pgbuf_has_any_waiters (PAGE_PTR pgptr)
{
#if defined (SERVER_MODE)
PGBUF_BCB *bufptr = NULL;
bool has_waiter;
/* note: we rule out flush waiters here */
assert (pgptr != NULL);
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
PGBUF_BCB_LOCK (bufptr);
has_waiter = pgbuf_is_exist_blocked_reader_writer (bufptr);
PGBUF_BCB_UNLOCK (bufptr);
return has_waiter;
#else
return false;
#endif
}
/*
* pgbuf_has_any_non_vacuum_waiters () - Check if page has any non-vacuum waiters.
*
* return : True if page has waiters, false otherwise.
* pgptr (in) : Page pointer.
*/
bool
pgbuf_has_any_non_vacuum_waiters (PAGE_PTR pgptr)
{
#if defined (SERVER_MODE)
PGBUF_BCB *bufptr = NULL;
THREAD_ENTRY *thread_entry_p;
assert (pgptr != NULL);
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
thread_entry_p = bufptr->next_wait_thrd;
while (thread_entry_p != NULL)
{
if (thread_entry_p->type != TT_VACUUM_WORKER)
{
return true;
}
thread_entry_p = thread_entry_p->next_wait_thrd;
}
return false;
#else
return false;
#endif
}
/*
* pgbuf_has_prevent_dealloc () - Quick check if page has any scanners.
*
* return : True if page deallocation is being prevented, false otherwise.
* pgptr (in) : Page pointer.
*/
bool
pgbuf_has_prevent_dealloc (PAGE_PTR pgptr)
{
#if defined (SERVER_MODE)
PGBUF_BCB *bufptr = NULL;
assert (pgptr != NULL);
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
return pgbuf_bcb_should_avoid_deallocation (bufptr);
#else
return false;
#endif
}
void
pgbuf_peek_stats (UINT64 * fixed_cnt, UINT64 * dirty_cnt, UINT64 * lru1_cnt, UINT64 * lru2_cnt, UINT64 * lru3_cnt,
UINT64 * victim_candidates, UINT64 * avoid_dealloc_cnt, UINT64 * avoid_victim_cnt,
UINT64 * private_quota, UINT64 * private_cnt, UINT64 * alloc_bcb_waiter_high,
UINT64 * alloc_bcb_waiter_med, UINT64 * flushed_bcbs_waiting_direct_assign,
UINT64 * lfcq_big_prv_num, UINT64 * lfcq_prv_num, UINT64 * lfcq_shr_num)
{
PGBUF_BCB *bufptr;
int i;
int bcb_flags;
PGBUF_ZONE zone;
*fixed_cnt = 0;
*dirty_cnt = 0;
*lru1_cnt = 0;
*lru2_cnt = 0;
*lru3_cnt = 0;
*avoid_dealloc_cnt = 0;
*avoid_victim_cnt = 0;
*private_cnt = 0;
*victim_candidates = 0;
for (i = 0; i < pgbuf_Pool.num_buffers; i++)
{
bufptr = PGBUF_FIND_BCB_PTR (i);
if (get_fcnt (&bufptr->atomic_latch) > 0)
{
*fixed_cnt = *fixed_cnt + 1;
}
/* copy flags. we do not lock the bcb and we can be affected by concurrent changes. */
bcb_flags = bufptr->flags;
if (bcb_flags & PGBUF_BCB_DIRTY_FLAG)
{
*dirty_cnt = *dirty_cnt + 1;
}
zone = PGBUF_GET_ZONE (bcb_flags);
if (zone == PGBUF_LRU_1_ZONE)
{
*lru1_cnt = *lru1_cnt + 1;
}
else if (zone == PGBUF_LRU_2_ZONE)
{
*lru2_cnt = *lru2_cnt + 1;
}
else if (zone == PGBUF_LRU_3_ZONE)
{
*lru3_cnt = *lru3_cnt + 1;
}
if (pgbuf_bcb_should_avoid_deallocation (bufptr))
{
*avoid_dealloc_cnt = *avoid_dealloc_cnt + 1;
}
if (bcb_flags & PGBUF_BCB_FLUSHING_TO_DISK_FLAG)
{
*avoid_victim_cnt = *avoid_victim_cnt + 1;
}
if (zone & PGBUF_LRU_ZONE_MASK)
{
if (PGBUF_IS_PRIVATE_LRU_INDEX (bcb_flags & PGBUF_LRU_INDEX_MASK))
{
*private_cnt = *private_cnt + 1;
}
}
}
for (i = 0; i < PGBUF_TOTAL_LRU_COUNT; i++)
{
*victim_candidates = *victim_candidates + pgbuf_Pool.buf_LRU_list[i].count_vict_cand;
}
*private_quota = (UINT64) (pgbuf_Pool.quota.private_pages_ratio * pgbuf_Pool.num_buffers);
#if defined (SERVER_MODE)
*alloc_bcb_waiter_high = pgbuf_Pool.direct_victims.waiter_threads_high_priority->size ();
*alloc_bcb_waiter_med = pgbuf_Pool.direct_victims.waiter_threads_low_priority->size ();
*flushed_bcbs_waiting_direct_assign = pgbuf_Pool.flushed_bcbs->size ();
#else /* !SERVER_MODE */
*alloc_bcb_waiter_high = 0;
*alloc_bcb_waiter_med = 0;
*flushed_bcbs_waiting_direct_assign = 0;
#endif /* !SERVER_MODE */
if (pgbuf_Pool.big_private_lrus_with_victims != NULL)
{
*lfcq_big_prv_num = pgbuf_Pool.big_private_lrus_with_victims->size ();
}
if (pgbuf_Pool.private_lrus_with_victims != NULL)
{
*lfcq_prv_num = pgbuf_Pool.private_lrus_with_victims->size ();
}
*lfcq_shr_num = pgbuf_Pool.shared_lrus_with_victims->size ();
}
/*
* pgbuf_flush_control_from_dirty_ratio () - Try to control adaptive flush aggressiveness based on the
* page buffer "dirtiness".
*
* return : Suggested number to increase flush rate.
*/
int
pgbuf_flush_control_from_dirty_ratio (void)
{
static int prev_dirties_cnt = 0;
int crt_dirties_cnt = (int) pgbuf_Pool.monitor.dirties_cnt;
int desired_dirty_cnt = pgbuf_Pool.num_buffers / 2;
int adapt_flush_rate = 0;
/* If the dirty ratio is now above the desired level, try to suggest a more aggressive flush to bring it back. */
if (crt_dirties_cnt > desired_dirty_cnt)
{
/* Try to get dirties count back to dirty desired ratio. */
/* Accelerate the rate when dirties count is higher. */
int dirties_above_desired_cnt = crt_dirties_cnt - desired_dirty_cnt;
int total_above_desired_cnt = pgbuf_Pool.num_buffers - desired_dirty_cnt;
adapt_flush_rate = dirties_above_desired_cnt * dirties_above_desired_cnt / total_above_desired_cnt;
}
/* Now consider the dirty growth rate. Even if the desired page buffer dirty ratio has not been reached yet, try to
 * avoid a sharp growth. Flush may not be aggressive enough and may require time to catch up. In the meantime, the
 * dirty ratio could go well beyond the desired ratio. */
if (crt_dirties_cnt > prev_dirties_cnt)
{
int diff = crt_dirties_cnt - prev_dirties_cnt;
/* Set a weight on the difference based on the dirty rate of buffer. */
adapt_flush_rate += diff * crt_dirties_cnt / pgbuf_Pool.num_buffers;
prev_dirties_cnt = crt_dirties_cnt;
}
return adapt_flush_rate;
}
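/*
 * Worked example (illustrative figures only): with num_buffers = 1000, desired_dirty_cnt = 500. If crt_dirties_cnt is
 * 700, then dirties_above_desired_cnt = 200 and total_above_desired_cnt = 500, suggesting 200 * 200 / 500 = 80 extra
 * flushes. If the count also grew from 650 to 700 since the previous call, the growth term adds 50 * 700 / 1000 = 35,
 * for a total suggestion of 115.
 */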
/*
* pgbuf_rv_flush_page () - Flush page during recovery. Some changes must be flushed immediately to provide
* consistency, in case server crashes again during recovery.
*
* return : Error code.
* thread_p (in) : Thread entry.
* rcv (in) : Recovery data (VPID of page to flush).
*/
int
pgbuf_rv_flush_page (THREAD_ENTRY * thread_p, LOG_RCV * rcv)
{
PAGE_PTR page_to_flush = NULL;
VPID vpid_to_flush = VPID_INITIALIZER;
LOG_DATA_ADDR addr = LOG_DATA_ADDR_INITIALIZER;
assert (rcv->pgptr == NULL);
assert (rcv->length == sizeof (VPID));
VPID_COPY (&vpid_to_flush, (VPID *) rcv->data);
page_to_flush =
pgbuf_fix (thread_p, &vpid_to_flush, OLD_PAGE_MAYBE_DEALLOCATED, PGBUF_LATCH_WRITE, PGBUF_UNCONDITIONAL_LATCH);
if (page_to_flush == NULL)
{
/* Page no longer exists. */
er_clear ();
return NO_ERROR;
}
/* Flush page and unfix. */
/* add a log or else the end of logical system operation will complain */
log_append_empty_record (thread_p, LOG_DUMMY_GENERIC, &addr);
pgbuf_set_dirty (thread_p, page_to_flush, DONT_FREE);
pgbuf_flush (thread_p, page_to_flush, DONT_FREE);
pgbuf_unfix (thread_p, page_to_flush);
return NO_ERROR;
}
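/*
 * Producer-side sketch (illustrative only; assumes RVPGBUF_FLUSH_PAGE is the recovery index mapped to
 * pgbuf_rv_flush_page): a module requiring its page to be flushed during a repeated recovery logs the VPID as redo
 * data with a NULL page pointer, matching the asserts above:
 *
 *   VPID vpid;
 *
 *   pgbuf_get_vpid (page, &vpid);
 *   log_append_redo_data2 (thread_p, RVPGBUF_FLUSH_PAGE, NULL, NULL, 0, sizeof (VPID), &vpid);
 */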
/*
* pgbuf_rv_flush_page_dump () - Dump data for recovery page flush.
*
* return : Void.
* fp (in) : Output target.
* length (in) : Length of recovery data.
* data (in) : Recovery data (VPID of page to flush).
*/
void
pgbuf_rv_flush_page_dump (FILE * fp, int length, void *data)
{
VPID vpid_to_flush = VPID_INITIALIZER;
assert (length == sizeof (VPID));
VPID_COPY (&vpid_to_flush, (VPID *) data);
fprintf (fp, "Page to flush: %d|%d. \n", vpid_to_flush.volid, vpid_to_flush.pageid);
}
/*
* pgbuf_latch_mode_str () - print latch_mode
*
* return : const char *
* latch_mode (in) :
*/
static const char *
pgbuf_latch_mode_str (PGBUF_LATCH_MODE latch_mode)
{
const char *latch_mode_str;
switch (latch_mode)
{
case PGBUF_NO_LATCH:
latch_mode_str = "No Latch";
break;
case PGBUF_LATCH_READ:
latch_mode_str = "Read";
break;
case PGBUF_LATCH_WRITE:
latch_mode_str = "Write";
break;
case PGBUF_LATCH_FLUSH:
latch_mode_str = "Flush";
break;
default:
latch_mode_str = "Fault";
break;
}
return latch_mode_str;
}
/*
* pgbuf_zone_str () - print zone info
*
* return : const char *
* zone (in) :
*/
static const char *
pgbuf_zone_str (PGBUF_ZONE zone)
{
const char *zone_str;
switch (zone)
{
case PGBUF_LRU_1_ZONE:
zone_str = "LRU_1_Zone";
break;
case PGBUF_LRU_2_ZONE:
zone_str = "LRU_2_Zone";
break;
case PGBUF_LRU_3_ZONE:
zone_str = "LRU_3_Zone";
break;
case PGBUF_INVALID_ZONE:
zone_str = "INVALID_Zone";
break;
default:
zone_str = "VOID_Zone";
break;
}
return zone_str;
}
/*
* pgbuf_consistent_str () - print consistent info
*
* return : const char *
* consistent (in) :
*/
static const char *
pgbuf_consistent_str (int consistent)
{
const char *consistent_str;
switch (consistent)
{
case PGBUF_CONTENT_GOOD:
consistent_str = "GOOD";
break;
case PGBUF_CONTENT_BAD:
consistent_str = "BAD";
break;
default:
consistent_str = "LIKELY BAD";
break;
}
return consistent_str;
}
/*
* pgbuf_get_fix_count () - Get page fix count.
*
* return : Fix count.
* pgptr (in) : Page pointer.
*/
int
pgbuf_get_fix_count (PAGE_PTR pgptr)
{
PGBUF_BCB *bufptr = NULL;
assert (pgptr != NULL);
CAST_PGPTR_TO_BFPTR (bufptr, pgptr);
return get_fcnt (&bufptr->atomic_latch);
}
/*
* pgbuf_get_hold_count () - Get hold count for current thread.
*
* return : Hold count
* thread_p (in) : Thread entry
*/
int
pgbuf_get_hold_count (THREAD_ENTRY * thread_p)
{
int me = thread_get_entry_index (thread_p);
return pgbuf_Pool.thrd_holder_info[me].num_hold_cnt;
}
/*
* pgbuf_get_page_type_for_stat () - Return the page type for current page
*
* return : page type
* pgptr (in) : pointer to a page
*/
PERF_PAGE_TYPE
pgbuf_get_page_type_for_stat (THREAD_ENTRY * thread_p, PAGE_PTR pgptr)
{
PERF_PAGE_TYPE perf_page_type;
FILEIO_PAGE *io_pgptr;
CAST_PGPTR_TO_IOPGPTR (io_pgptr, pgptr);
if ((io_pgptr->prv.ptype == PAGE_BTREE)
&& (perfmon_get_activation_flag () & PERFMON_ACTIVATION_FLAG_DETAILED_BTREE_PAGE))
{
perf_page_type = btree_get_perf_btree_page_type (thread_p, pgptr);
}
else
{
perf_page_type = (PERF_PAGE_TYPE) io_pgptr->prv.ptype;
}
return perf_page_type;
}
/*
* pgbuf_log_new_page () - log new page being created
*
* return : void
* thread_p (in) : thread entry
* page_new (in) : new page
* data_size (in) : size of page data
* ptype_new (in) : new page type
*/
void
pgbuf_log_new_page (THREAD_ENTRY * thread_p, PAGE_PTR page_new, int data_size, PAGE_TYPE ptype_new)
{
assert (ptype_new != PAGE_UNKNOWN);
assert (page_new != NULL);
assert (data_size > 0);
log_append_undoredo_data2 (thread_p, RVPGBUF_NEW_PAGE, NULL, page_new, (PGLENGTH) ptype_new, 0, data_size, NULL,
page_new);
pgbuf_set_dirty (thread_p, page_new, DONT_FREE);
}
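/*
 * pgbuf_log_redo_new_page () - log new page being created, as redo data only (used when no undo is needed)
 *
 * return : void
 * thread_p (in) : thread entry
 * page_new (in) : new page
 * data_size (in) : size of page data
 * ptype_new (in) : new page type
 */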
void
pgbuf_log_redo_new_page (THREAD_ENTRY * thread_p, PAGE_PTR page_new, int data_size, PAGE_TYPE ptype_new)
{
assert (ptype_new != PAGE_UNKNOWN);
assert (page_new != NULL);
assert (data_size > 0);
log_append_redo_data2 (thread_p, RVPGBUF_NEW_PAGE, NULL, page_new, (PGLENGTH) ptype_new, data_size, page_new);
pgbuf_set_dirty (thread_p, page_new, DONT_FREE);
}
/*
* pgbuf_rv_new_page_redo () - Apply redo for changing an entire page (or at least its first part).
*
* return : NO_ERROR.
* thread_p (in) : Thread entry.
* rcv (in) : Recovery data.
*/
int
pgbuf_rv_new_page_redo (THREAD_ENTRY * thread_p, LOG_RCV * rcv)
{
PAGE_TYPE set_page_type;
assert (rcv->pgptr != NULL);
assert (rcv->length >= 0);
assert (rcv->length <= DB_PAGESIZE);
if (rcv->length > 0)
{
memcpy (rcv->pgptr, rcv->data, rcv->length);
}
set_page_type = (PAGE_TYPE) rcv->offset;
if (set_page_type != PAGE_UNKNOWN)
{
pgbuf_set_page_ptype (thread_p, rcv->pgptr, set_page_type);
}
else
{
assert (false);
}
pgbuf_set_dirty (thread_p, rcv->pgptr, DONT_FREE);
return NO_ERROR;
}
/*
* pgbuf_rv_new_page_undo () - undo new page (by resetting its page type to PAGE_UNKNOWN)
*
* return : NO_ERROR
* thread_p (in) : thread entry
* rcv (in) : recovery data
*/
int
pgbuf_rv_new_page_undo (THREAD_ENTRY * thread_p, LOG_RCV * rcv)
{
pgbuf_set_page_ptype (thread_p, rcv->pgptr, PAGE_UNKNOWN);
pgbuf_set_dirty (thread_p, rcv->pgptr, DONT_FREE);
return NO_ERROR;
}
/*
* pgbuf_dealloc_page () - deallocate a page
*
* return : void
* thread_p (in) : thread entry
* page_dealloc (in) : page to deallocate
*/
void
pgbuf_dealloc_page (THREAD_ENTRY * thread_p, PAGE_PTR page_dealloc)
{
PGBUF_BCB *bcb = NULL;
PAGE_TYPE ptype;
FILEIO_PAGE_RESERVED *prv;
PGBUF_DEALLOC_UNDO_DATA udata;
char undo_data[8]; // pageid(4) + volid(2) + ptype(1) + pflag(1)
int holder_status;
/* how it works: page is "deallocated" by resetting its type to PAGE_UNKNOWN. also prepare bcb for victimization.
*
* note: the bcb used to be invalidated. but that means flushing page to disk and waiting for IO write. that may be
* too slow. if we add the bcb to the bottom of a lru list, it will be eventually flushed by flush thread and
* victimized. */
CAST_PGPTR_TO_BFPTR (bcb, page_dealloc);
assert (get_fcnt (&bcb->atomic_latch) == 1);
prv = &bcb->iopage_buffer->iopage.prv;
assert (prv->ptype != PAGE_UNKNOWN);
udata.pageid = prv->pageid;
udata.volid = prv->volid;
udata.ptype = prv->ptype;
udata.pflag = prv->pflag;
log_append_undoredo_data2 (thread_p, RVPGBUF_DEALLOC, NULL, page_dealloc, 0, sizeof (udata), 0, &udata, NULL);
PGBUF_BCB_LOCK (bcb);
#if !defined(NDEBUG)
if (bcb->iopage_buffer->iopage.prv.pflag & FILEIO_PAGE_FLAG_ENCRYPTED_MASK)
{
tde_er_log ("pgbuf_dealloc_page(): clear tde bit in pflag, VPID = %d|%d, tde_algorithm = %s\n",
VPID_AS_ARGS (&bcb->vpid), tde_get_algorithm_name (pgbuf_get_tde_algorithm (page_dealloc)));
}
#endif /* !NDEBUG */
/* set unknown type */
bcb->iopage_buffer->iopage.prv.ptype = (unsigned char) PAGE_UNKNOWN;
/* clear page flags (now only tde algorithm) */
bcb->iopage_buffer->iopage.prv.pflag = (unsigned char) 0;
/* set dirty and mark to move to the bottom of lru */
pgbuf_bcb_update_flags (thread_p, bcb, PGBUF_BCB_DIRTY_FLAG | PGBUF_BCB_MOVE_TO_LRU_BOTTOM_FLAG, 0);
holder_status = pgbuf_unlatch_thrd_holder (thread_p, bcb, NULL);
#if !defined (NDEBUG)
thread_p->get_pgbuf_tracker ().decrement (page_dealloc);
#endif // !NDEBUG
(void) pgbuf_unlatch_bcb_upon_unfix (thread_p, bcb, holder_status);
/* bufptr->mutex has been released in above function. */
}
/*
* pgbuf_rv_dealloc_redo () - redo page deallocate (by resetting page type to unknown).
*
* return : NO_ERROR
* thread_p (in) : thread entry
* rcv (in) : recovery data
*/
int
pgbuf_rv_dealloc_redo (THREAD_ENTRY * thread_p, LOG_RCV * rcv)
{
pgbuf_set_page_ptype (thread_p, rcv->pgptr, PAGE_UNKNOWN);
pgbuf_set_tde_algorithm (thread_p, rcv->pgptr, TDE_ALGORITHM_NONE, true);
pgbuf_set_dirty (thread_p, rcv->pgptr, DONT_FREE);
return NO_ERROR;
}
/*
* pgbuf_rv_dealloc_undo () - undo page deallocation. the page is validated by setting its page type back.
*
* return : error code
* thread_p (in) : thread entry
* rcv (in) : recovery data
*
* note: we had to make this function logical, because if a page is deallocated, it cannot be fixed, unless we use
* fetch type OLD_PAGE_DEALLOCATED.
*/
int
pgbuf_rv_dealloc_undo (THREAD_ENTRY * thread_p, LOG_RCV * rcv)
{
PAGE_PTR page_deallocated = NULL;
PGBUF_DEALLOC_UNDO_DATA *udata = (PGBUF_DEALLOC_UNDO_DATA *) rcv->data;
VPID vpid;
FILEIO_PAGE *iopage;
vpid.pageid = udata->pageid;
vpid.volid = udata->volid;
assert (rcv->length == sizeof (PGBUF_DEALLOC_UNDO_DATA));
assert (udata->ptype > PAGE_UNKNOWN && udata->ptype <= PAGE_LAST);
/* fix deallocated page */
page_deallocated = pgbuf_fix (thread_p, &vpid, OLD_PAGE_DEALLOCATED, PGBUF_LATCH_WRITE, PGBUF_UNCONDITIONAL_LATCH);
if (page_deallocated == NULL)
{
assert_release (false);
return ER_FAILED;
}
assert (pgbuf_get_page_ptype (thread_p, page_deallocated) == PAGE_UNKNOWN);
pgbuf_set_page_ptype (thread_p, page_deallocated, (PAGE_TYPE) udata->ptype);
CAST_PGPTR_TO_IOPGPTR (iopage, page_deallocated);
iopage->prv.pflag = udata->pflag;
#if !defined(NDEBUG)
if (iopage->prv.pflag & FILEIO_PAGE_FLAG_ENCRYPTED_MASK)
{
tde_er_log ("pgbuf_rv_dealloc_page(): reset tde bit in pflag, VPID = %d|%d, tde_algorithm = %s\n",
VPID_AS_ARGS (&vpid), tde_get_algorithm_name (pgbuf_get_tde_algorithm (page_deallocated)));
}
#endif /* !NDEBUG */
log_append_compensate_with_undo_nxlsa (thread_p, RVPGBUF_COMPENSATE_DEALLOC, &vpid, 0, page_deallocated,
sizeof (PGBUF_DEALLOC_UNDO_DATA), udata, LOG_FIND_CURRENT_TDES (thread_p),
&rcv->reference_lsa);
pgbuf_set_dirty_and_free (thread_p, page_deallocated);
return NO_ERROR;
}
/*
* pgbuf_rv_dealloc_undo_compensate () - compensation for undo page deallocation. the page is validated by setting its page type back.
*
* return : error code
* thread_p (in) : thread entry
* rcv (in) : recovery data
*
*/
int
pgbuf_rv_dealloc_undo_compensate (THREAD_ENTRY * thread_p, LOG_RCV * rcv)
{
PGBUF_DEALLOC_UNDO_DATA *udata = (PGBUF_DEALLOC_UNDO_DATA *) rcv->data;
VPID vpid;
FILEIO_PAGE *iopage;
assert (rcv->pgptr != NULL);
assert (rcv->length == sizeof (PGBUF_DEALLOC_UNDO_DATA));
assert (udata->ptype > PAGE_UNKNOWN && udata->ptype <= PAGE_LAST);
CAST_PGPTR_TO_IOPGPTR (iopage, rcv->pgptr);
pgbuf_set_page_ptype (thread_p, rcv->pgptr, (PAGE_TYPE) udata->ptype);
iopage->prv.pflag = udata->pflag;
#if !defined(NDEBUG)
if (iopage->prv.pflag & FILEIO_PAGE_FLAG_ENCRYPTED_MASK)
{
tde_er_log ("pgbuf_rv_dealloc_page(): reset tde bit in pflag, VPID = %d|%d, tde_algorithm = %s\n",
VPID_AS_ARGS (&vpid), tde_get_algorithm_name (pgbuf_get_tde_algorithm (rcv->pgptr)));
}
#endif /* !NDEBUG */
return NO_ERROR;
}
/*
* pgbuf_fix_if_not_deallocated () - fix a page only if it is not deallocated. the difference compared to a regular
* page fix is that finding a deallocated page is expected here; if the page is
* indeed deallocated, it is not fixed.
*
* return : error code
* thread_p (in) : thread entry
* vpid (in) : page identifier
* latch_mode (in) : latch mode
* latch_condition (in) : latch condition
* page (out) : output fixed page if not deallocated. output NULL if deallocated.
* caller_file (in) : caller file name
* caller_line (in) : caller line
*/
int
pgbuf_fix_if_not_deallocated_with_caller (THREAD_ENTRY * thread_p, const VPID * vpid, PGBUF_LATCH_MODE latch_mode,
PGBUF_LATCH_CONDITION latch_condition, PAGE_PTR * page
#if !defined (NDEBUG)
, const char *caller_file, int caller_line, const char *caller_func
#endif
)
{
DISK_ISVALID isvalid;
int error_code = NO_ERROR;
assert (vpid != NULL && !VPID_ISNULL (vpid));
assert (page != NULL);
*page = NULL;
/* First, check whether the file was destroyed. Such a check may create performance issues.
 * This function should be adapted: if the transaction already holds a lock on the table, the
 * code that checks whether the file was destroyed can be skipped.
 */
isvalid = disk_is_page_sector_reserved (thread_p, vpid->volid, vpid->pageid);
if (isvalid == DISK_INVALID)
{
/* deallocated */
return NO_ERROR;
}
else if (isvalid == DISK_ERROR)
{
ASSERT_ERROR_AND_SET (error_code);
return error_code;
}
assert (isvalid == DISK_VALID);
/* is reserved */
#if defined (NDEBUG)
*page = pgbuf_fix_release (thread_p, vpid, OLD_PAGE_MAYBE_DEALLOCATED, latch_mode, latch_condition);
#else /* !NDEBUG */
*page =
pgbuf_fix_debug (thread_p, vpid, OLD_PAGE_MAYBE_DEALLOCATED, latch_mode, latch_condition, caller_file, caller_line,
caller_func);
#endif /* !NDEBUG */
if (*page == NULL && !log_is_in_crash_recovery_and_not_yet_completes_redo ())
{
ASSERT_ERROR_AND_SET (error_code);
if (error_code == ER_PB_BAD_PAGEID)
{
/* deallocated */
er_clear ();
error_code = NO_ERROR;
}
}
return error_code;
}
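/*
 * Usage sketch (illustrative only; assumes the pgbuf_fix_if_not_deallocated wrapper that supplies the caller
 * information): a NULL output page together with NO_ERROR means the page was deallocated in the meantime.
 *
 *   PAGE_PTR page = NULL;
 *
 *   if (pgbuf_fix_if_not_deallocated (thread_p, &vpid, PGBUF_LATCH_READ, PGBUF_UNCONDITIONAL_LATCH, &page)
 *       != NO_ERROR)
 *     {
 *       ASSERT_ERROR ();      // a genuine error
 *     }
 *   else if (page == NULL)
 *     {
 *       // page was deallocated; skip it
 *     }
 */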
#if defined (SERVER_MODE)
/*
* pgbuf_keep_victim_flush_thread_running () - keep flush thread running
*
* return : true to keep flush thread running, false otherwise
*/
bool
pgbuf_keep_victim_flush_thread_running (void)
{
return (pgbuf_is_any_thread_waiting_for_direct_victim () || pgbuf_is_hit_ratio_low ());
}
#endif /* SERVER_MODE */
/*
* pgbuf_assign_direct_victim () - try to assign bcb directly to a thread waiting for victim. bcb must be a valid victim
* candidate
*
* return : true if bcb was assigned directly as victim, false otherwise
* thread_p (in) : thread entry
* bcb (in) : bcb to assign as victim
*/
STATIC_INLINE bool
pgbuf_assign_direct_victim (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
#if defined (SERVER_MODE)
THREAD_ENTRY *waiter_thread = NULL;
PERF_UTIME_TRACKER timetr;
PERF_UTIME_TRACKER_START (thread_p, &timetr);
/* must hold bcb mutex and victimization should be possible. the only victim-candidate invalidating flag allowed here
* is PGBUF_BCB_FLUSHING_TO_DISK_FLAG (because flush also calls this). */
assert (!pgbuf_bcb_is_direct_victim (bcb));
assert (!pgbuf_bcb_is_invalid_direct_victim (bcb));
assert (!pgbuf_bcb_is_dirty (bcb));
assert (!pgbuf_is_bcb_fixed_by_any (bcb, true));
PGBUF_BCB_CHECK_OWN (bcb);
/* the is-flushing flag is expected here, since this is also called from flush. the caller should make sure no other
 * case gets here with the flag set. */
/* if marked as victim candidate, we are sorry for the one that marked it. we'll override the flag. */
/* do we have any waiter threads? */
while (pgbuf_get_thread_waiting_for_direct_victim (waiter_thread))
{
assert (waiter_thread != NULL);
thread_lock_entry (waiter_thread);
if (waiter_thread->resume_status != THREAD_ALLOC_BCB_SUSPENDED)
{
/* it is not waiting for us anymore */
thread_unlock_entry (waiter_thread);
continue;
}
/* wakeup suspended thread */
thread_wakeup_already_had_mutex (waiter_thread, THREAD_ALLOC_BCB_RESUMED);
/* assign bcb to thread */
pgbuf_bcb_update_flags (thread_p, bcb, PGBUF_BCB_VICTIM_DIRECT_FLAG, PGBUF_BCB_FLUSHING_TO_DISK_FLAG);
pgbuf_Pool.direct_victims.bcb_victims[waiter_thread->index] = bcb;
thread_unlock_entry (waiter_thread);
PERF_UTIME_TRACKER_TIME (thread_p, &timetr, PSTAT_PB_ASSIGN_DIRECT_BCB);
/* bcb was assigned */
return true;
}
PERF_UTIME_TRACKER_TIME (thread_p, &timetr, PSTAT_PB_ASSIGN_DIRECT_BCB);
#endif /* SERVER_MODE */
/* no waiting threads */
return false;
}
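/*
 * Waiter-side counterpart (sketch of the protocol): the allocating thread enqueues itself into one of the
 * direct_victims waiter queues, suspends with resume status THREAD_ALLOC_BCB_SUSPENDED and, once woken up with
 * THREAD_ALLOC_BCB_RESUMED, retrieves the assigned bcb from pgbuf_Pool.direct_victims.bcb_victims[thread_p->index]
 * via pgbuf_get_direct_victim (see below).
 */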
#if defined (SERVER_MODE)
/*
* pgbuf_assign_flushed_pages () - assign flushed pages directly as victims, or just mark them as flushed if they
* cannot be assigned.
*
* return : true if the flushed bcb queue was not empty, false otherwise
* thread_p (in) : thread entry
*/
bool
pgbuf_assign_flushed_pages (THREAD_ENTRY * thread_p)
{
PGBUF_BCB *bcb_flushed = NULL;
bool detailed_perf = perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION);
bool not_empty = false;
/* invalidation flag for direct victim assignment: any flag invalidating victim candidates, except is flushing flag */
int invalidate_flag = (PGBUF_BCB_INVALID_VICTIM_CANDIDATE_MASK & (~PGBUF_BCB_FLUSHING_TO_DISK_FLAG));
/* consume all flushed bcbs queue */
while (pgbuf_Pool.flushed_bcbs->consume (bcb_flushed))
{
not_empty = true;
/* we need to lock mutex */
PGBUF_BCB_LOCK (bcb_flushed);
if ((bcb_flushed->flags & invalidate_flag) != 0)
{
/* dirty bcb is not a valid victim */
}
else if (pgbuf_is_bcb_fixed_by_any (bcb_flushed, true))
{
/* bcb is fixed. we cannot assign it as victim */
}
else if (!PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (bcb_flushed))
{
/* bcb is hot. don't assign it as victim */
}
else if (PGBUF_IS_PRIVATE_LRU_INDEX (pgbuf_bcb_get_lru_index (bcb_flushed))
&& !PGBUF_LRU_LIST_IS_OVER_QUOTA (pgbuf_lru_list_from_bcb (bcb_flushed)))
{
/* bcb belongs to a private list under quota. give it a chance. */
}
else if (pgbuf_assign_direct_victim (thread_p, bcb_flushed))
{
/* assigned directly */
if (detailed_perf)
{
perfmon_inc_stat (thread_p, PSTAT_PB_VICTIM_ASSIGN_DIRECT_FLUSH);
}
}
else
{
/* not assigned directly */
assert (!pgbuf_bcb_is_direct_victim (bcb_flushed));
/* could not assign it directly. there must be no waiters */
}
/* make sure bcb is no longer marked as flushing */
pgbuf_bcb_mark_was_flushed (thread_p, bcb_flushed);
/* wakeup thread waiting for flush */
if (bcb_flushed->next_wait_thrd != NULL)
{
pgbuf_wake_flush_waiters (thread_p, bcb_flushed);
}
PGBUF_BCB_UNLOCK (bcb_flushed);
}
return not_empty;
}
/*
* pgbuf_get_thread_waiting_for_direct_victim () - get one of the threads waiting for a direct victim
*
* return : true if got thread, false otherwise
* waiting_thread_out (out) : output thread waiting for victim
*/
STATIC_INLINE bool
pgbuf_get_thread_waiting_for_direct_victim (REFPTR (THREAD_ENTRY, waiting_thread_out))
{
static INT64 count = 0;
INT64 my_count = ATOMIC_INC_64 (&count, 1);
/* every now and then, force getting waiting threads from queues with lesser priority */
if (my_count % 4 == 0)
{
if (pgbuf_Pool.direct_victims.waiter_threads_low_priority->consume (waiting_thread_out))
{
return true;
}
}
/* try queue in their priority order */
if (pgbuf_Pool.direct_victims.waiter_threads_high_priority->consume (waiting_thread_out))
{
return true;
}
if (pgbuf_Pool.direct_victims.waiter_threads_low_priority->consume (waiting_thread_out))
{
return true;
}
return false;
}
/*
* pgbuf_get_direct_victim () - get victim assigned directly.
*
* return : pointer to victim bcb
* thread_p (in) : thread entry
*/
STATIC_INLINE PGBUF_BCB *
pgbuf_get_direct_victim (THREAD_ENTRY * thread_p)
{
PGBUF_BCB *bcb =
(PGBUF_BCB *) ATOMIC_TAS_ADDR (&pgbuf_Pool.direct_victims.bcb_victims[thread_p->index], (PGBUF_BCB *) NULL);
int lru_idx;
assert (bcb != NULL);
PGBUF_BCB_LOCK (bcb);
if (pgbuf_bcb_is_invalid_direct_victim (bcb))
{
/* somebody fixed the page again. */
pgbuf_bcb_update_flags (thread_p, bcb, 0, PGBUF_BCB_INVALIDATE_DIRECT_VICTIM_FLAG);
PGBUF_BCB_UNLOCK (bcb);
return NULL;
}
assert (pgbuf_bcb_is_direct_victim (bcb));
/* clear direct victim flag */
pgbuf_bcb_update_flags (thread_p, bcb, 0, PGBUF_BCB_VICTIM_DIRECT_FLAG);
if (!pgbuf_is_bcb_victimizable (bcb, true))
{
/* should not happen */
assert (false);
PGBUF_BCB_UNLOCK (bcb);
return NULL;
}
switch (pgbuf_bcb_get_zone (bcb))
{
case PGBUF_VOID_ZONE:
break;
case PGBUF_INVALID_ZONE:
/* should not be here */
assert (false);
break;
default:
/* lru zones */
assert (PGBUF_IS_BCB_IN_LRU (bcb));
lru_idx = pgbuf_bcb_get_lru_index (bcb);
/* remove bcb from lru list */
pgbuf_lru_remove_bcb (thread_p, bcb);
/* add to AOUT */
pgbuf_add_vpid_to_aout_list (thread_p, &bcb->vpid, lru_idx);
break;
}
assert (pgbuf_bcb_get_zone (bcb) == PGBUF_VOID_ZONE);
return bcb;
}
/*
* pgbuf_is_any_thread_waiting_for_direct_victim () - is any thread waiting to allocate bcb?
*
* return : true/false
*/
STATIC_INLINE bool
pgbuf_is_any_thread_waiting_for_direct_victim (void)
{
return (!pgbuf_Pool.direct_victims.waiter_threads_high_priority->is_empty ()
|| !pgbuf_Pool.direct_victims.waiter_threads_low_priority->is_empty ());
}
#endif /* SERVER_MODE */
/*
* pgbuf_lru_add_victim_candidate () - add a victim candidate: update the list's victim hint and increment its victim
* candidate counter
*
* return : void
* thread_p (in) : thread entry
* lru_list (in) : lru list
* bcb (in) : bcb that became a victim candidate
*/
STATIC_INLINE void
pgbuf_lru_add_victim_candidate (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, PGBUF_BCB * bcb)
{
PGBUF_BCB *old_victim_hint;
int list_tick;
/* first, let's update the victim hint. */
/* We don't own the LRU mutex here, so after we read the victim_hint, another thread may change that BCB,
* or the victim_hint pointer itself.
* All changes of lru_list->victim_hint must be preceded by moving the new hint BCB to the LRU3 zone; the checks must
* be repeated here in the same sequence:
* 1. read lru_list->victim_hint
* 2. stop if old_victim_hint is still in LRU3 and is older than proposed to be hint
* 3. atomically change the hint
* (old_victim_hint may suffer other changes including relocating to another LRU, this is protected by the atomic op)
*/
do
{
/* replace current victim hint only if this candidate is better, that is, if its age in zone 3 is greater than
 * that of the current hint */
old_victim_hint = lru_list->victim_hint;
list_tick = lru_list->tick_lru3;
if (old_victim_hint != NULL && PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (old_victim_hint)
&& (PGBUF_AGE_DIFF (old_victim_hint->tick_lru3, list_tick) > PGBUF_AGE_DIFF (bcb->tick_lru3, list_tick)))
{
/* current hint is older. */
break;
}
/* compare & swap. if it fails, the hint must have been updated by someone else (it is possible even if we hold
* lru and bcb mutexes, see pgbuf_set_dirty). we try until we succeed changing the hint or until the current hint
* is better. */
}
while (!ATOMIC_CAS_ADDR (&lru_list->victim_hint, old_victim_hint, bcb));
/* update victim counter. */
/* add to lock-free circular queue so victimizers can find it... if this is not a private list under quota. */
ATOMIC_INC_32 (&lru_list->count_vict_cand, 1);
if (PGBUF_IS_SHARED_LRU_INDEX (lru_list->index) || PGBUF_LRU_LIST_IS_OVER_QUOTA (lru_list))
{
if (pgbuf_lfcq_add_lru_with_victims (lru_list)
&& perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION))
{
/* added to queue of lru lists having victims. */
}
}
}
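/*
 * Worked example (illustrative ticks; assumes PGBUF_AGE_DIFF measures how long ago a bcb entered zone 3 relative to
 * list_tick): with list_tick = 100, a current hint with tick_lru3 = 80 has age 20, while a candidate with
 * tick_lru3 = 95 has age 5; the hint is older, so the loop breaks and keeps it. A candidate with tick_lru3 = 60
 * (age 40) would instead win the compare & swap and become the new hint.
 */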
/*
* pgbuf_lru_remove_victim_candidate () - decrement lru list victim candidate counter
*
* return : void
* thread_p (in) : thread entry
* lru_list (in) : lru list
* bcb (in) : bcb that is no longer a victim candidate
*/
STATIC_INLINE void
pgbuf_lru_remove_victim_candidate (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, PGBUF_BCB * bcb)
{
/* first update victim counter */
if (ATOMIC_INC_32 (&lru_list->count_vict_cand, -1) == 0)
{
/* we cannot remove an entry from lock-free circular queue easily. we just hope that this does not happen too
* often. do nothing here. */
}
}
/*
* pgbuf_lru_advance_victim_hint () - invalidate bcb_prev_hint as victim hint and advance to bcb_new_hint (if possible).
* in the case where we would reset the hint to NULL but victim candidates still
* exist, the hint is set to the list bottom.
*
* return : void
* thread_p (in) : thread entry
* lru_list (in) : LRU list
* bcb_prev_hint (in) : bcb being invalidated as hint
* bcb_new_hint (in) : new desired hint (can be adjusted to NULL or bottom)
* was_vict_count_updated (in) : was victim count updated? (false if bcb_prev_hint is still counted as victim candidate)
*/
STATIC_INLINE void
pgbuf_lru_advance_victim_hint (THREAD_ENTRY * thread_p, PGBUF_LRU_LIST * lru_list, PGBUF_BCB * bcb_prev_hint,
PGBUF_BCB * bcb_new_hint, bool was_vict_count_updated)
{
PGBUF_BCB *new_victim_hint = NULL;
/* note: caller must have lock on lru list! */
/* todo: add watchers on lru list mutexes */
/* new victim hint should be either NULL or in the victimization zone */
new_victim_hint = (bcb_new_hint != NULL && PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (bcb_new_hint)) ? bcb_new_hint : NULL;
/* restart from bottom if hint is NULL but we have victim candidates */
new_victim_hint = ((new_victim_hint == NULL && lru_list->count_vict_cand > (was_vict_count_updated ? 0 : 1))
? lru_list->bottom : new_victim_hint);
new_victim_hint = ((new_victim_hint != NULL && PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (new_victim_hint))
? new_victim_hint : NULL);
/* update hint (if it was not already updated) */
assert (new_victim_hint == NULL || pgbuf_bcb_get_lru_index (new_victim_hint) == lru_list->index);
if (ATOMIC_CAS_ADDR (&lru_list->victim_hint, bcb_prev_hint, new_victim_hint))
{
/* updated hint */
}
assert (lru_list->victim_hint == NULL || PGBUF_IS_BCB_IN_LRU_VICTIM_ZONE (lru_list->victim_hint));
}
/*
* pgbuf_bcb_update_flags () - update bcb flags (not zone and not lru index)
*
* return : void
* bcb (in) : bcb
* set_flags (in) : flags to set
* clear_flags (in) : flags to clear
*
* note: this makes sure the bcb flags field (which is actually flags + zone + lru index) is modified atomically. it
* also handles changes of victim candidates.
*/
STATIC_INLINE void
pgbuf_bcb_update_flags (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int set_flags, int clear_flags)
{
int old_flags;
int new_flags;
bool old_dirty, new_dirty;
/* sanity checks */
assert (bcb != NULL);
assert ((set_flags & (~PGBUF_BCB_FLAGS_MASK)) == 0);
assert ((clear_flags & (~PGBUF_BCB_FLAGS_MASK)) == 0);
/* update flags by making sure that other flags + zone + lru_index are not modified. */
do
{
old_flags = bcb->flags;
new_flags = old_flags | set_flags;
new_flags = new_flags & (~clear_flags);
if (old_flags == new_flags)
{
/* no changes are required. */
return;
}
}
while (!ATOMIC_CAS_32 (&bcb->flags, old_flags, new_flags));
if (PGBUF_GET_ZONE (old_flags) == PGBUF_LRU_3_ZONE)
{
/* bcb is in lru zone that can be victimized. some flags invalidate the victimization candidacy of a bcb;
* therefore we need to check if the bcb status regarding victimization is changed. */
bool is_old_invalid_victim_candidate = (old_flags & PGBUF_BCB_INVALID_VICTIM_CANDIDATE_MASK) != 0;
bool is_new_invalid_victim_candidate = (new_flags & PGBUF_BCB_INVALID_VICTIM_CANDIDATE_MASK) != 0;
PGBUF_LRU_LIST *lru_list;
lru_list = pgbuf_lru_list_from_bcb (bcb);
if (is_old_invalid_victim_candidate && !is_new_invalid_victim_candidate)
{
/* bcb has become a victim candidate */
pgbuf_lru_add_victim_candidate (thread_p, lru_list, bcb);
}
else if (!is_old_invalid_victim_candidate && is_new_invalid_victim_candidate)
{
/* bcb is no longer a victim candidate */
pgbuf_lru_remove_victim_candidate (thread_p, lru_list, bcb);
}
else
{
/* bcb status remains the same */
}
}
old_dirty = (old_flags & PGBUF_BCB_DIRTY_FLAG) != 0;
new_dirty = (new_flags & PGBUF_BCB_DIRTY_FLAG) != 0;
if (old_dirty && !new_dirty)
{
/* cleared dirty flag. */
ATOMIC_INC_64 (&pgbuf_Pool.monitor.dirties_cnt, -1);
}
else if (!old_dirty && new_dirty)
{
/* added dirty flag */
ATOMIC_INC_64 (&pgbuf_Pool.monitor.dirties_cnt, 1);
}
assert (pgbuf_Pool.monitor.dirties_cnt >= 0 && pgbuf_Pool.monitor.dirties_cnt <= pgbuf_Pool.num_buffers);
}
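/*
 * Example (illustrative): for a bcb currently in LRU zone 3,
 *
 *   pgbuf_bcb_update_flags (thread_p, bcb, PGBUF_BCB_DIRTY_FLAG, 0);
 *
 * atomically sets the dirty flag, removes the bcb from its list's victim candidates (a dirty page cannot be
 * victimized) and increments pgbuf_Pool.monitor.dirties_cnt. Clearing the flag reverses both effects.
 */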
/*
* pgbuf_bcb_change_zone () - change the zone and lru index of bcb, but keep the bcb flags. also handles the zone
* counters, victim counter and victim hint for lru lists.
*
* return : void
* bcb (in) : bcb
* lru_idx (in) : lru index (0 if not in any lru zone)
* zone (in) : zone
*
* this is called whenever the bcb is moved from one logical zone to another. possible transitions:
*
* FIXME: correct the following description
* 1. get from invalid list invalid => void (bcb is locked)
* 2. get victim lru => void (list & bcb are locked)
* 3. unfix void/lru => lru (list & bcb are locked)
* 4. lru adjust zones lru => lru (list is locked)
*
* note: two simultaneous change zones on the same bcb should not be possible. the only case when bcb is not locked
* is case 4, however list is locked. other possible cases that can call change zone on same bcb must have lock
* on lru mutex.
*
* note: bcb->flags is changed here and simultaneous calls of pgbuf_bcb_update_flags is possible. in some cases, the
* flags may change even with no mutex (pgbuf_set_dirty). since we have to handle victim counter and hint for
* lru lists, we must do atomic operations to modify the zone, and keep previous and new flag values. based on
* these flags, we then update lru zone counters, lru victim counter and lru victim hint. lru zone counters can
* only be modified by other calls to pgbuf_bcb_change_zone in the same lru and are protected by the lru mutex, so
* they can be modified without atomic operations.
*/
STATIC_INLINE void
pgbuf_bcb_change_zone (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, int new_lru_idx, PGBUF_ZONE new_zone)
{
int old_flags;
int new_flags;
int new_zone_idx = PGBUF_MAKE_ZONE (new_lru_idx, new_zone);
bool is_valid_victim_candidate;
PGBUF_LRU_LIST *lru_list;
/* note: make sure the zones being changed from and to are blocked */
/* sanity checks */
assert (bcb != NULL);
assert (new_lru_idx == 0 || new_zone == PGBUF_LRU_1_ZONE || new_zone == PGBUF_LRU_2_ZONE
|| new_zone == PGBUF_LRU_3_ZONE);
/* update bcb->flags. make sure we are only changing the values for zone and lru index, but we preserve the flags. */
do
{
/* get current value of bcb->flags */
old_flags = bcb->flags;
/* now set new flags to same bcb flags + new zone & lru index */
new_flags = (old_flags & PGBUF_BCB_FLAGS_MASK) | new_zone_idx;
/* compare & swap. if we fail, we have to try again. until we succeed. */
}
while (!ATOMIC_CAS_32 (&bcb->flags, old_flags, new_flags));
/* was bcb a valid victim candidate (we only consider flags, not fix counters or zone)? note that this is still true
* after the change of zone. */
is_valid_victim_candidate = (old_flags & PGBUF_BCB_INVALID_VICTIM_CANDIDATE_MASK) == 0;
if (old_flags & PGBUF_LRU_ZONE_MASK)
{
/* bcb was in a lru list. we need to update zone counters. */
int lru_idx = PGBUF_GET_LRU_INDEX (old_flags);
lru_list = PGBUF_GET_LRU_LIST (lru_idx);
/* hint should have been changed already if the BCB was in LRU3; otherwise (if downgraded), we may expect the
 * victim hint to be changed by another thread (checkpoint->pgbuf_bcb_update_flags) */
assert (lru_list->victim_hint != bcb || PGBUF_GET_ZONE (old_flags) != PGBUF_LRU_3_ZONE);
if (PGBUF_IS_SHARED_LRU_INDEX (PGBUF_GET_LRU_INDEX (old_flags)))
{
ATOMIC_INC_32 (&pgbuf_Pool.monitor.lru_shared_pgs_cnt, -1);
}
switch (PGBUF_GET_ZONE (old_flags))
{
case PGBUF_LRU_1_ZONE:
lru_list->count_lru1--;
break;
case PGBUF_LRU_2_ZONE:
lru_list->count_lru2--;
break;
case PGBUF_LRU_3_ZONE:
lru_list->count_lru3--;
if (is_valid_victim_candidate)
{
/* bcb was a valid victim and in the zone that could be victimized. update victim counter & hint */
pgbuf_lru_remove_victim_candidate (thread_p, lru_list, bcb);
}
break;
default:
assert (false);
break;
}
}
if (new_zone & PGBUF_LRU_ZONE_MASK)
{
lru_list = PGBUF_GET_LRU_LIST (new_lru_idx);
if (PGBUF_IS_SHARED_LRU_INDEX (PGBUF_GET_LRU_INDEX (new_flags)))
{
ATOMIC_INC_32 (&pgbuf_Pool.monitor.lru_shared_pgs_cnt, 1);
}
switch (new_zone)
{
case PGBUF_LRU_1_ZONE:
lru_list->count_lru1++;
break;
case PGBUF_LRU_2_ZONE:
lru_list->count_lru2++;
break;
case PGBUF_LRU_3_ZONE:
lru_list->count_lru3++;
if (is_valid_victim_candidate)
{
pgbuf_lru_add_victim_candidate (thread_p, lru_list, bcb);
}
break;
default:
assert (false);
break;
}
}
}
/*
* pgbuf_bcb_get_zone () - get zone of bcb
*
* return : PGBUF_ZONE
* bcb (in) : bcb
*/
STATIC_INLINE PGBUF_ZONE
pgbuf_bcb_get_zone (const PGBUF_BCB * bcb)
{
return PGBUF_GET_ZONE (bcb->flags);
}
/*
* pgbuf_bcb_get_lru_index () - get lru index of bcb. make sure bcb is in lru zones.
*
* return : lru index
* bcb (in) : bcb
*/
STATIC_INLINE int
pgbuf_bcb_get_lru_index (const PGBUF_BCB * bcb)
{
assert (PGBUF_IS_BCB_IN_LRU (bcb));
return PGBUF_GET_LRU_INDEX (bcb->flags);
}
/*
* pgbuf_bcb_is_dirty () - is bcb dirty?
*
* return : true/false
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_is_dirty (const PGBUF_BCB * bcb)
{
return (bcb->flags & PGBUF_BCB_DIRTY_FLAG) != 0;
}
/*
* pgbuf_bcb_set_dirty () - set dirty flag to bcb
*
* return : void
* bcb (in) : bcb
*/
STATIC_INLINE void
pgbuf_bcb_set_dirty (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
/* set dirty flag and clear none */
/* note: we usually use the pgbuf_bcb_update_flags function. we make an exception for pgbuf_bcb_set_dirty since it is
 * the most frequent case and the code should be as optimal as possible. */
int old_flags;
do
{
old_flags = bcb->flags;
if (old_flags & PGBUF_BCB_DIRTY_FLAG)
{
/* already dirty */
return;
}
}
while (!ATOMIC_CAS_32 (&bcb->flags, old_flags, old_flags | PGBUF_BCB_DIRTY_FLAG));
/* was changed to dirty */
ATOMIC_INC_64 (&pgbuf_Pool.monitor.dirties_cnt, 1);
assert (pgbuf_Pool.monitor.dirties_cnt >= 0 && pgbuf_Pool.monitor.dirties_cnt <= pgbuf_Pool.num_buffers);
if (PGBUF_GET_ZONE (old_flags) == PGBUF_LRU_3_ZONE && (old_flags & PGBUF_BCB_INVALID_VICTIM_CANDIDATE_MASK) == 0)
{
/* invalidate victim */
pgbuf_lru_remove_victim_candidate (thread_p, pgbuf_lru_list_from_bcb (bcb), bcb);
}
}
/*
* pgbuf_bcb_clear_dirty () - clear dirty flag from bcb
*
* return : void
* bcb (in) : bcb
*/
STATIC_INLINE void
pgbuf_bcb_clear_dirty (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
/* set no flag and clear dirty */
pgbuf_bcb_update_flags (thread_p, bcb, 0, PGBUF_BCB_DIRTY_FLAG);
}
/*
* pgbuf_bcb_mark_is_flushing () - mark page is being flushed. dirty flag is also cleared because while the page is
* flushed to disk, another thread may fix the page and modify it. the new change must
* be tracked.
*
* return : true if the bcb was dirty before (the caller must write it to disk), false otherwise
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_mark_is_flushing (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
if (pgbuf_bcb_is_dirty (bcb))
{
/* set flushing flag and clear dirty */
pgbuf_bcb_update_flags (thread_p, bcb, PGBUF_BCB_FLUSHING_TO_DISK_FLAG,
PGBUF_BCB_DIRTY_FLAG | PGBUF_BCB_ASYNC_FLUSH_REQ);
return true;
}
else
{
pgbuf_bcb_update_flags (thread_p, bcb, PGBUF_BCB_FLUSHING_TO_DISK_FLAG, PGBUF_BCB_ASYNC_FLUSH_REQ);
return false;
}
}
/*
* pgbuf_bcb_mark_was_flushed () - mark page was flushed to disk
*
* return : void
* bcb (in) : bcb
*/
STATIC_INLINE void
pgbuf_bcb_mark_was_flushed (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb)
{
/* set no flag and clear flushing */
pgbuf_bcb_update_flags (thread_p, bcb, 0, PGBUF_BCB_FLUSHING_TO_DISK_FLAG);
}
/*
* pgbuf_bcb_mark_was_not_flushed () - page flush failed
*
* return : void
* bcb (in) : bcb
* mark_dirty(in): true if BCB needs to be marked dirty
*/
STATIC_INLINE void
pgbuf_bcb_mark_was_not_flushed (THREAD_ENTRY * thread_p, PGBUF_BCB * bcb, bool mark_dirty)
{
/* set dirty flag and clear flushing */
pgbuf_bcb_update_flags (thread_p, bcb, mark_dirty ? PGBUF_BCB_DIRTY_FLAG : 0, PGBUF_BCB_FLUSHING_TO_DISK_FLAG);
}
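/*
 * Flush protocol sketch (illustrative; write_failed is a hypothetical placeholder for the disk write result): the
 * three markers pair around the actual I/O,
 *
 *   bool was_dirty = pgbuf_bcb_mark_is_flushing (thread_p, bcb);   // clears dirty, sets flushing
 *   // ... write the io page to disk ...
 *   if (write_failed)
 *     {
 *       pgbuf_bcb_mark_was_not_flushed (thread_p, bcb, was_dirty); // restore the dirty flag
 *     }
 *   else
 *     {
 *       pgbuf_bcb_mark_was_flushed (thread_p, bcb);
 *     }
 *
 * so a concurrent fix-and-modify during the write simply sets the dirty flag again and its change is not lost.
 */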
/*
* pgbuf_bcb_is_flushing () - is page being flushed to disk?
*
* return : true/false
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_is_flushing (const PGBUF_BCB * bcb)
{
return (bcb->flags & PGBUF_BCB_FLUSHING_TO_DISK_FLAG) != 0;
}
/*
* pgbuf_bcb_is_direct_victim () - is bcb assigned as victim directly?
*
* return : true/false
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_is_direct_victim (const PGBUF_BCB * bcb)
{
return (bcb->flags & PGBUF_BCB_VICTIM_DIRECT_FLAG) != 0;
}
/*
* pgbuf_bcb_is_invalid_direct_victim () - is bcb assigned as victim directly, but invalidated after?
*
* return : true/false
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_is_invalid_direct_victim (const PGBUF_BCB * bcb)
{
return (bcb->flags & PGBUF_BCB_INVALIDATE_DIRECT_VICTIM_FLAG) != 0;
}
/*
* pgbuf_bcb_is_async_flush_request () - is bcb async flush requested?
*
* return : true/false
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_is_async_flush_request (const PGBUF_BCB * bcb)
{
return (bcb->flags & PGBUF_BCB_ASYNC_FLUSH_REQ) != 0;
}
/*
* pgbuf_bcb_should_be_moved_to_bottom_lru () - is bcb supposed to be moved to the bottom of lru?
*
* return : true/false
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_should_be_moved_to_bottom_lru (const PGBUF_BCB * bcb)
{
return (bcb->flags & PGBUF_BCB_MOVE_TO_LRU_BOTTOM_FLAG) != 0;
}
/*
* pgbuf_notify_vacuum_follows () - notify that page will likely be accessed by vacuum
*
* return : void
* thread_p (in) : thread entry
* page (in) : page
*/
void
pgbuf_notify_vacuum_follows (THREAD_ENTRY * thread_p, PAGE_PTR page)
{
PGBUF_BCB *bcb;
CAST_PGPTR_TO_BFPTR (bcb, page);
pgbuf_bcb_update_flags (thread_p, bcb, PGBUF_BCB_TO_VACUUM_FLAG, 0);
}
/*
* pgbuf_bcb_is_to_vacuum () - is page going to be accessed by vacuum?
*
* return : true/false
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_is_to_vacuum (const PGBUF_BCB * bcb)
{
return (bcb->flags & PGBUF_BCB_TO_VACUUM_FLAG) != 0;
}
/*
* pgbuf_bcb_avoid_victim () - should bcb be avoided for victimization?
*
* return : true/false
* bcb (in) : bcb
*
* note: true if any flag that invalidates the bcb's victim candidacy is set
*/
STATIC_INLINE bool
pgbuf_bcb_avoid_victim (const PGBUF_BCB * bcb)
{
return (bcb->flags & PGBUF_BCB_INVALID_VICTIM_CANDIDATE_MASK) != 0;
}
/*
* pgbuf_bcb_get_pool_index () - get bcb pool index
*
* return : pool index
* bcb (in) : BCB
*/
STATIC_INLINE int
pgbuf_bcb_get_pool_index (const PGBUF_BCB * bcb)
{
return (int) (bcb - pgbuf_Pool.BCB_table);
}
/*
* pgbuf_bcb_register_avoid_deallocation () - avoid deallocating bcb's page.
*
* return : void
* bcb (in) : bcb
*/
STATIC_INLINE void
pgbuf_bcb_register_avoid_deallocation (PGBUF_BCB * bcb)
{
assert ((bcb->count_fix_and_avoid_dealloc & 0x00008000) == 0);
(void) ATOMIC_INC_32 (&bcb->count_fix_and_avoid_dealloc, 1);
}
/*
* pgbuf_bcb_unregister_avoid_deallocation () - avoiding page deallocation no longer required
*
* return : void
* bcb (in) : bcb
*/
STATIC_INLINE void
pgbuf_bcb_unregister_avoid_deallocation (PGBUF_BCB * bcb)
{
int count_crt;
do
{
/* get bcb->count_fix_and_avoid_dealloc (volatile) */
count_crt = bcb->count_fix_and_avoid_dealloc;
assert ((count_crt & 0x00008000) == 0);
if ((count_crt & PGBUF_BCB_AVOID_DEALLOC_MASK) > 0)
{
/* we can decrement counter */
}
else
{
/* interestingly enough, this case can happen. how?
*
* well, pgbuf_ordered_fix may be forced to unfix all pages currently held by the transaction to fix a new page.
* all pages that are "less" than the new page are marked to avoid deallocation and unfixed. the transaction is then
* blocked on latching the new page, which may take a while; meanwhile, the previously unfixed pages can be
* victimized. when pgbuf_ordered_fix tries to fix these pages back, it loads them from disk and, tadaa, the avoid
* deallocation count is 0. so we do expect this case.
*
* note: avoid deallocation count is supposed to prevent vacuum workers from deallocating these pages.
* so, victimizing a bcb marked to avoid deallocation is not perfectly safe. however, the likelihood of
* page really getting deallocated is ... almost zero. the alternative of avoiding victimization when
* bcb's are marked for deallocation is much more complicated and poses serious risks (what if we leak
* the counter and prevent bcb from being victimized indefinitely?). so, we prefer the existing risks.
*/
er_log_debug (ARG_FILE_LINE,
"pgbuf_bcb_unregister_avoid_deallocation: bcb %p, vpid = %d|%d was probably victimized.\n",
bcb, VPID_AS_ARGS (&bcb->vpid));
break;
}
}
while (!ATOMIC_CAS_32 (&bcb->count_fix_and_avoid_dealloc, count_crt, count_crt - 1));
}
/*
* pgbuf_bcb_should_avoid_deallocation () - should avoid deallocating page?
*
* return : true/false
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_should_avoid_deallocation (const PGBUF_BCB * bcb)
{
assert (bcb->count_fix_and_avoid_dealloc >= 0);
assert ((bcb->count_fix_and_avoid_dealloc & 0x00008000) == 0);
return (bcb->count_fix_and_avoid_dealloc & PGBUF_BCB_AVOID_DEALLOC_MASK) != 0;
}
/*
* pgbuf_bcb_check_and_reset_fix_and_avoid_dealloc () - check that the avoid deallocation counter is 0 and reset the whole bcb field.
*
* return : void
* bcb (in) : bcb
* file (in) : caller file
* line (in) : caller line
*
* note: avoid deallocation is allowed to be non-zero due to pgbuf_ordered_fix and the possibility of victimizing its
* bcb. avoid crashing the server and just issue a warning.
*/
STATIC_INLINE void
pgbuf_bcb_check_and_reset_fix_and_avoid_dealloc (PGBUF_BCB * bcb, const char *file, int line)
{
if (pgbuf_bcb_should_avoid_deallocation (bcb))
{
er_log_debug (file, line, "warning: bcb %p, vpid = %d|%d, should not have avoid deallocation marker.\n",
bcb, VPID_AS_ARGS (&bcb->vpid));
}
bcb->count_fix_and_avoid_dealloc = 0;
}
/*
* pgbuf_bcb_register_fix () - register page fix
*
* return : void
* bcb (in) : bcb
*/
STATIC_INLINE void
pgbuf_bcb_register_fix (PGBUF_BCB * bcb)
{
/* note: we only register fixes to detect hot pages. once we hit the threshold, there is no need to keep counting. */
if (bcb->count_fix_and_avoid_dealloc < (PGBUF_FIX_COUNT_THRESHOLD << PGBUF_BCB_COUNT_FIX_SHIFT_BITS))
{
#if !defined (NDEBUG)
int newval =
#endif /* !NDEBUG */
ATOMIC_INC_32 (&bcb->count_fix_and_avoid_dealloc, 1 << PGBUF_BCB_COUNT_FIX_SHIFT_BITS);
assert (newval >= (1 << PGBUF_BCB_COUNT_FIX_SHIFT_BITS));
assert (bcb->count_fix_and_avoid_dealloc >= (1 << PGBUF_BCB_COUNT_FIX_SHIFT_BITS));
}
}
/*
* pgbuf_bcb_is_hot () - is bcb hot (was it fixed more than threshold times?)
*
* return : true/false
* bcb (in) : bcb
*/
STATIC_INLINE bool
pgbuf_bcb_is_hot (const PGBUF_BCB * bcb)
{
assert (bcb->count_fix_and_avoid_dealloc >= 0);
return bcb->count_fix_and_avoid_dealloc >= (PGBUF_FIX_COUNT_THRESHOLD << PGBUF_BCB_COUNT_FIX_SHIFT_BITS);
}
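/*
 * Layout note (illustrative): count_fix_and_avoid_dealloc packs two counters into a single atomic integer. The low
 * bits (PGBUF_BCB_AVOID_DEALLOC_MASK) count avoid-deallocation registrations, bit 0x00008000 is a sentinel asserted
 * to stay zero, and the bits above PGBUF_BCB_COUNT_FIX_SHIFT_BITS count fixes. For example, a page fixed 3 times
 * with one active avoid-deallocation marker holds (3 << PGBUF_BCB_COUNT_FIX_SHIFT_BITS) | 1, and the bcb is
 * considered hot once the fix part reaches PGBUF_FIX_COUNT_THRESHOLD.
 */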
/*
* pgbuf_lfcq_add_lru_with_victims () - add lru list to queue of lists that can be victimized. this queue was designed
* so victimizers can find a list with victims quickly without iterating through
* many lists that are full.
*
* return : true if list was added, false if it was already added by someone else.
* lru_list (in) : lru list
*/
STATIC_INLINE bool
pgbuf_lfcq_add_lru_with_victims (PGBUF_LRU_LIST * lru_list)
{
int old_flags = lru_list->flags;
if (old_flags & PGBUF_LRU_VICTIM_LFCQ_FLAG)
{
/* already added. */
return false;
}
/* use compare & swap because we cannot allow two threads adding same list in queue */
if (ATOMIC_CAS_32 (&lru_list->flags, old_flags, old_flags | PGBUF_LRU_VICTIM_LFCQ_FLAG))
{
/* add to queues. we keep private and shared lists separated. */
if (PGBUF_IS_PRIVATE_LRU_INDEX (lru_list->index))
{
/* private list */
if (pgbuf_Pool.private_lrus_with_victims->produce (lru_list->index))
{
return true;
}
}
else
{
/* shared list */
if (pgbuf_Pool.shared_lrus_with_victims->produce (lru_list->index))
{
return true;
}
}
/* clear the flag */
lru_list->flags &= ~PGBUF_LRU_VICTIM_LFCQ_FLAG;
}
/* not added */
return false;
}
/*
* pgbuf_lfcq_get_victim_from_private_lru () - get a victim from a private list in lock-free queues.
*
* return : victim or NULL
* thread_p (in) : thread entry
* restricted (in) : true if victimizing is restricted to big private lists
*/
static PGBUF_BCB *
pgbuf_lfcq_get_victim_from_private_lru (THREAD_ENTRY * thread_p, bool restricted)
{
#define PERF(id) if (detailed_perf) perfmon_inc_stat (thread_p, id)
int lru_idx;
PGBUF_LRU_LIST *lru_list;
PGBUF_BCB *victim = NULL;
bool detailed_perf = perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION);
bool added_back = false;
if (pgbuf_Pool.private_lrus_with_victims == NULL)
{
return NULL;
}
assert (pgbuf_Pool.big_private_lrus_with_victims != NULL);
if (pgbuf_Pool.big_private_lrus_with_victims->consume (lru_idx))
{
/* prioritize big lists */
PERF (PSTAT_PB_LFCQ_LRU_PRV_GET_CALLS);
PERF (PSTAT_PB_LFCQ_LRU_PRV_GET_BIG);
}
else
{
if (restricted)
{
return NULL;
}
PERF (PSTAT_PB_LFCQ_LRU_PRV_GET_CALLS);
if (!pgbuf_Pool.private_lrus_with_victims->consume (lru_idx))
{
/* empty handed */
PERF (PSTAT_PB_LFCQ_LRU_PRV_GET_EMPTY);
return NULL;
}
}
assert (PGBUF_IS_PRIVATE_LRU_INDEX (lru_idx));
lru_list = PGBUF_GET_LRU_LIST (lru_idx);
if (PGBUF_LRU_LIST_COUNT (lru_list) > PBGUF_BIG_PRIVATE_MIN_SIZE
&& PGBUF_LRU_LIST_COUNT (lru_list) > 2 * lru_list->quota && lru_list->count_vict_cand > 1)
{
/* add big private lists back immediately */
if (pgbuf_Pool.big_private_lrus_with_victims->produce (lru_idx))
{
added_back = true;
}
}
/* get victim from list */
victim = pgbuf_get_victim_from_lru_list (thread_p, lru_idx);
PERF (victim != NULL ? PSTAT_PB_VICTIM_OTHER_PRIVATE_LRU_SUCCESS : PSTAT_PB_VICTIM_OTHER_PRIVATE_LRU_FAIL);
if (added_back)
{
/* already added back to queue */
return victim;
}
if (lru_list->count_vict_cand > 0 && PGBUF_LRU_LIST_IS_OVER_QUOTA (lru_list))
{
if (pgbuf_Pool.private_lrus_with_victims->produce (lru_idx))
{
return victim;
}
}
/* we're not adding the list back to the queue... so we need to reflect that in the list flags. next time when a new
* candidate is added, lru list should also be added to the queue.
*
* note: we can have a race here. candidates are 0 now and incremented before we manage to change the victim
* counter. we should not worry that much, the list will be added by pgbuf_adjust_quotas eventually.
*/
assert ((lru_list->flags & PGBUF_LRU_VICTIM_LFCQ_FLAG) != 0);
/* note: we are not using an atomic operation here, because this is the only flag and we are certain no one else
* changes it from set to cleared. however, if more flags are added, or more cases that should clear the flag,
* then consider replacing with some atomic operation. */
lru_list->flags &= ~PGBUF_LRU_VICTIM_LFCQ_FLAG;
return victim;
#undef PERF
}
/*
* pgbuf_lfcq_get_victim_from_shared_lru () - get a victim from a shared list in lock-free queues.
*
* return : victim or NULL
* thread_p (in) : thread entry
* multi_threaded (in) : true if multi-threaded system
*/
static PGBUF_BCB *
pgbuf_lfcq_get_victim_from_shared_lru (THREAD_ENTRY * thread_p, bool multi_threaded)
{
#define PERF(id) if (detailed_perf) perfmon_inc_stat (thread_p, id)
int lru_idx;
PGBUF_LRU_LIST *lru_list;
PGBUF_BCB *victim = NULL;
bool detailed_perf = perfmon_is_perf_tracking_and_active (PERFMON_ACTIVATION_FLAG_PB_VICTIMIZATION);
PERF (PSTAT_PB_LFCQ_LRU_SHR_GET_CALLS);
if (!pgbuf_Pool.shared_lrus_with_victims->consume (lru_idx))
{
/* no list has candidates! */
PERF (PSTAT_PB_LFCQ_LRU_SHR_GET_EMPTY);
return NULL;
}
/* popped a list with victim candidates from queue */
assert (PGBUF_IS_SHARED_LRU_INDEX (lru_idx));
lru_list = PGBUF_GET_LRU_LIST (lru_idx);
victim = pgbuf_get_victim_from_lru_list (thread_p, lru_idx);
PERF (victim != NULL ? PSTAT_PB_VICTIM_SHARED_LRU_SUCCESS : PSTAT_PB_VICTIM_SHARED_LRU_FAIL);
/* no victim found in first step, but flush thread ran and candidates can be found, try again */
if (victim == NULL && multi_threaded == false && lru_list->count_vict_cand > 0)
{
victim = pgbuf_get_victim_from_lru_list (thread_p, lru_idx);
PERF (victim != NULL ? PSTAT_PB_VICTIM_SHARED_LRU_SUCCESS : PSTAT_PB_VICTIM_SHARED_LRU_FAIL);
}
if ((multi_threaded || victim != NULL) && lru_list->count_vict_cand > 0)
{
/* add lru list back to queue */
if (pgbuf_Pool.shared_lrus_with_victims->produce (lru_idx))
{
return victim;
}
else
{
/* we couldn't add to queue. it usually does not happen, but a consumer can be preempted for a long time,
* temporarily creating the impression that queue is full. it will be added later, when a new victim
* candidate shows up or when adjust quota checks it. */
/* fall through */
}
}
/* we're not adding the list back to the queue... so we need to reflect that in the list flags. next time when a new
* candidate is added, lru list should also be added to the queue.
*
* note: we can have a race here. candidates are 0 now and incremented before we manage to change the victim
* counter. we should not worry that much, the list will be added by pgbuf_adjust_quotas eventually.
*/
assert ((lru_list->flags & PGBUF_LRU_VICTIM_LFCQ_FLAG) != 0);
/* note: we are not using an atomic operation here, because this is the only flag and we are certain no one else
* changes it from set to cleared. however, if more flags are added, or more cases that should clear the flag,
* then consider replacing with some atomic operation. */
lru_list->flags &= ~PGBUF_LRU_VICTIM_LFCQ_FLAG;
return victim;
#undef PERF
}
/*
* pgbuf_lru_list_from_bcb () - get lru list of bcb
*
* return : lru list
* bcb (in) : bcb
*/
STATIC_INLINE PGBUF_LRU_LIST *
pgbuf_lru_list_from_bcb (const PGBUF_BCB * bcb)
{
assert (PGBUF_IS_BCB_IN_LRU (bcb));
return PGBUF_GET_LRU_LIST (pgbuf_bcb_get_lru_index (bcb));
}
/*
* pgbuf_bcb_register_hit_for_lru () - register hit when bcb is unfixed for its current lru.
*
* return : void
* bcb (in) : BCB
*/
STATIC_INLINE void
pgbuf_bcb_register_hit_for_lru (PGBUF_BCB * bcb)
{
assert (PGBUF_IS_BCB_IN_LRU (bcb));
if (bcb->hit_age < pgbuf_Pool.quota.adjust_age)
{
pgbuf_Pool.monitor.lru_hits[pgbuf_bcb_get_lru_index (bcb)]++;
bcb->hit_age = pgbuf_Pool.quota.adjust_age;
}
}
/*
* pgbuf_is_io_stressful () - is io stressful (are threads waiting for victims?)
*
* return : true/false
*/
bool
pgbuf_is_io_stressful (void)
{
#if defined (SERVER_MODE)
/* we consider the IO stressful if threads end up waiting for victims */
return !pgbuf_Pool.direct_victims.waiter_threads_low_priority->is_empty ();
#else /* !SERVER_MODE */
return false;
#endif /* !SERVER_MODE */
}
/*
* pgbuf_is_hit_ratio_low () - is page buffer hit ratio low? currently target is set to 99.9%.
*
* return : true/false
*/
STATIC_INLINE bool
pgbuf_is_hit_ratio_low (void)
{
#define PGBUF_MIN_VICTIM_REQ 10 /* set a minimum number of requests */
#define PGBUF_DESIRED_HIT_VS_MISS_RATE 1000 /* 99.9% hit ratio */
return (pgbuf_Pool.monitor.lru_victim_req_cnt > PGBUF_MIN_VICTIM_REQ
&& pgbuf_Pool.monitor.lru_victim_req_cnt * PGBUF_DESIRED_HIT_VS_MISS_RATE >
pgbuf_Pool.monitor.fix_req_cnt.load (std::memory_order_seq_cst));
#undef PGBUF_DESIRED_HIT_VS_MISS_RATE
#undef PGBUF_MIN_VICTIM_REQ
}
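/*
 * Worked example (illustrative counts): with fix_req_cnt = 1,000,000, the hit ratio is considered low once
 * lru_victim_req_cnt exceeds 1,000, because 1,001 * 1000 > 1,000,000; that is, more than 0.1% of fix requests
 * needed a victim, missing the 99.9% target.
 */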
#if defined (SERVER_MODE)
/*
* pgbuf_bcbmon_lock () - monitor and lock bcb mutex
*
* return : void
* bcb (in) : BCB to lock
* caller_line (in) : caller line
*/
static void
pgbuf_bcbmon_lock (PGBUF_BCB * bcb, int caller_line)
{
int index = thread_get_current_entry_index ();
PGBUF_MONITOR_BCB_MUTEX *monitor_bcb_mutex = &pgbuf_Pool.monitor.bcb_locks[index];
assert_release (pgbuf_Monitor_locks);
if (monitor_bcb_mutex->bcb != NULL)
{
/* already have a bcb mutex. we cannot lock another one unless try lock is used. */
PGBUF_ABORT_RELEASE ();
}
if (monitor_bcb_mutex->bcb_second != NULL)
{
/* already have a bcb mutex. we cannot lock another one unless try lock is used. */
PGBUF_ABORT_RELEASE ();
}
if (bcb->owner_mutex == index)
{
/* double lock */
PGBUF_ABORT_RELEASE ();
}
/* ok, we can lock */
(void) pthread_mutex_lock (&bcb->mutex);
if (bcb->owner_mutex >= 0)
{
/* somebody else has mutex? */
PGBUF_ABORT_RELEASE ();
}
monitor_bcb_mutex->bcb = bcb;
monitor_bcb_mutex->line = caller_line;
bcb->owner_mutex = index;
}
/*
* pgbuf_bcbmon_trylock () - monitor and try locking bcb mutex. do not wait if it is already locked
*
* return : try lock result
* bcb (in) : BCB to lock
* caller_line (in) : caller line
*/
static int
pgbuf_bcbmon_trylock (PGBUF_BCB * bcb, int caller_line)
{
int index = thread_get_current_entry_index ();
int rv;
PGBUF_MONITOR_BCB_MUTEX *monitor_bcb_mutex = &pgbuf_Pool.monitor.bcb_locks[index];
assert_release (pgbuf_Monitor_locks);
if (bcb->owner_mutex == index)
{
/* double lock */
PGBUF_ABORT_RELEASE ();
}
if (monitor_bcb_mutex->bcb != NULL && monitor_bcb_mutex->bcb_second != NULL)
{
/* two bcb's are already locked. */
PGBUF_ABORT_RELEASE ();
}
if (monitor_bcb_mutex->bcb != NULL && monitor_bcb_mutex->bcb == bcb)
{
/* same bcb is already locked?? */
PGBUF_ABORT_RELEASE ();
}
/* try lock */
rv = pthread_mutex_trylock (&bcb->mutex);
if (rv == 0)
{
/* success. monitor it. */
if (monitor_bcb_mutex->bcb == NULL)
{
monitor_bcb_mutex->bcb = bcb;
monitor_bcb_mutex->line = caller_line;
}
else
{
monitor_bcb_mutex->bcb_second = bcb;
monitor_bcb_mutex->line_second = caller_line;
}
bcb->owner_mutex = index;
}
else
{
/* failed */
}
return rv;
}
/*
* pgbuf_bcbmon_unlock () - monitor and unlock BCB mutex
*
* return : void
* bcb (in) : BCB to unlock
*/
static void
pgbuf_bcbmon_unlock (PGBUF_BCB * bcb)
{
int index = thread_get_current_entry_index ();
PGBUF_MONITOR_BCB_MUTEX *monitor_bcb_mutex = &pgbuf_Pool.monitor.bcb_locks[index];
assert_release (pgbuf_Monitor_locks);
/* should be monitored */
if (bcb->owner_mutex != index)
{
/* I did not lock it?? */
PGBUF_ABORT_RELEASE ();
}
bcb->owner_mutex = -1;
if (monitor_bcb_mutex->bcb == bcb)
{
/* remove bcb from monitor. */
monitor_bcb_mutex->bcb = NULL;
}
else if (monitor_bcb_mutex->bcb_second == bcb)
{
/* remove bcb from monitor */
monitor_bcb_mutex->bcb_second = NULL;
}
else
{
/* I did not monitor it?? */
PGBUF_ABORT_RELEASE ();
}
pthread_mutex_unlock (&bcb->mutex);
}
/*
* pgbuf_bcbmon_check_own () - check current thread owns bcb mutex.
*
* return : void
* bcb (in) : BCB
*
* note: monitoring page buffer locks must be activated
*/
static void
pgbuf_bcbmon_check_own (PGBUF_BCB * bcb)
{
int index = thread_get_current_entry_index ();
PGBUF_MONITOR_BCB_MUTEX *monitor_bcb_mutex = &pgbuf_Pool.monitor.bcb_locks[index];
assert_release (pgbuf_Monitor_locks);
if (bcb->owner_mutex != index)
{
/* not owned */
PGBUF_ABORT_RELEASE ();
}
if (monitor_bcb_mutex->bcb != bcb && monitor_bcb_mutex->bcb_second != bcb)
{
/* not monitored? */
PGBUF_ABORT_RELEASE ();
}
}
/*
* pgbuf_bcbmon_check_mutex_leaks () - check for mutex leaks. must be called on exit points where no BCB should be
* locked.
*
* note: only works if page buffer lock monitoring is enabled.
*/
static void
pgbuf_bcbmon_check_mutex_leaks (void)
{
int index = thread_get_current_entry_index ();
PGBUF_MONITOR_BCB_MUTEX *monitor_bcb_mutex = &pgbuf_Pool.monitor.bcb_locks[index];
assert_release (pgbuf_Monitor_locks);
if (monitor_bcb_mutex->bcb != NULL)
{
PGBUF_ABORT_RELEASE ();
}
if (monitor_bcb_mutex->bcb_second != NULL)
{
PGBUF_ABORT_RELEASE ();
}
}
#endif /* SERVER_MODE */
/*
* pgbuf_flags_mask_sanity_check () - check flags mask do not overlap!
*
*/
static void
pgbuf_flags_mask_sanity_check (void)
{
/* sanity check: make sure the masks for bcb flags, zone and lru index do not overlap. this should be immediately
* caught, so abort the server whenever happens. */
if (PGBUF_BCB_FLAGS_MASK & PGBUF_ZONE_MASK)
{
PGBUF_ABORT_RELEASE ();
}
if (PGBUF_BCB_FLAGS_MASK & PGBUF_LRU_INDEX_MASK)
{
PGBUF_ABORT_RELEASE ();
}
if (PGBUF_ZONE_MASK & PGBUF_LRU_INDEX_MASK)
{
PGBUF_ABORT_RELEASE ();
}
if ((PGBUF_INVALID_ZONE | PGBUF_VOID_ZONE) & PGBUF_LRU_ZONE_MASK)
{
PGBUF_ABORT_RELEASE ();
}
}
/*
* pgbuf_lru_sanity_check () - check lru list is sane
*
* return : void
* lru (in) : lru list
*/
static void
pgbuf_lru_sanity_check (const PGBUF_LRU_LIST * lru)
{
#if !defined (NDEBUG)
if (lru->top == NULL)
{
/* empty list */
assert (lru->count_lru1 == 0 && lru->count_lru2 == 0 && lru->count_lru3 == 0 && lru->bottom == NULL
&& lru->bottom_1 == NULL && lru->bottom_2 == NULL);
return;
}
/* not empty */
assert (lru->bottom != NULL);
assert (lru->count_lru1 != 0 || lru->count_lru2 != 0 || lru->count_lru3 != 0);
/* zone 1 */
assert ((lru->count_lru1 == 0) == (lru->bottom_1 == NULL));
if (lru->bottom_1 != NULL)
{
assert (pgbuf_bcb_get_zone (lru->bottom_1) == PGBUF_LRU_1_ZONE);
assert (pgbuf_bcb_get_zone (lru->top) == PGBUF_LRU_1_ZONE);
if (lru->bottom_1->next_BCB != NULL)
{
if (pgbuf_bcb_get_zone (lru->bottom_1->next_BCB) == PGBUF_LRU_1_ZONE)
{
assert (false);
}
else if (pgbuf_bcb_get_zone (lru->bottom_1->next_BCB) == PGBUF_LRU_2_ZONE)
{
assert (lru->count_lru2 != 0 && lru->bottom_2 != NULL);
}
else
{
assert (lru->count_lru3 != 0);
}
}
else
{
assert (lru->count_lru2 == 0 && lru->count_lru3 == 0 && lru->bottom_2 == NULL
&& lru->bottom == lru->bottom_1);
}
}
/* zone 2 */
assert ((lru->count_lru2 == 0) == (lru->bottom_2 == NULL));
if (lru->bottom_2 != NULL)
{
assert (pgbuf_bcb_get_zone (lru->bottom_2) == PGBUF_LRU_2_ZONE);
assert (lru->bottom_2 != NULL || pgbuf_bcb_get_zone (lru->top) == PGBUF_LRU_2_ZONE);
if (lru->bottom_2->next_BCB != NULL)
{
if (pgbuf_bcb_get_zone (lru->bottom_2->next_BCB) == PGBUF_LRU_2_ZONE)
{
assert (false);
}
else if (pgbuf_bcb_get_zone (lru->bottom_2->next_BCB) == PGBUF_LRU_1_ZONE)
{
assert (false);
}
else if (lru->count_lru3 == 0)
{
assert (false);
}
}
else
{
assert (lru->count_lru3 == 0 && lru->bottom == lru->bottom_2);
}
}
#endif /* !NDEBUG */
}
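/* Shape of a non-empty lru list implied by the asserts above:
*
*   top -> [zone 1 ...] -> bottom_1 -> [zone 2 ...] -> bottom_2 -> [zone 3 ...] -> bottom
*
* each zone section may be empty, in which case its bottom pointer is NULL and its counter is zero; bottom always
* points to the last bcb of the last non-empty section. */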
// TODO: find a better place for this, but not log_impl.h
/*
* pgbuf_find_current_wait_msecs () - find the wait time for the current transaction
*
* return : wait time in milliseconds, or 0 when no transaction descriptor is found
*
* Note: Reads wait_msecs from the transaction descriptor of the current thread.
*/
STATIC_INLINE int
pgbuf_find_current_wait_msecs (THREAD_ENTRY * thread_p)
{
LOG_TDES *tdes; /* Transaction descriptor */
int tran_index;
tran_index = LOG_FIND_THREAD_TRAN_INDEX (thread_p);
tdes = LOG_FIND_TDES (tran_index);
if (tdes != NULL)
{
return tdes->wait_msecs;
}
else
{
return 0;
}
}
/*
* pgbuf_get_page_flush_interval () - setup page flush daemon period based on system parameter
*/
void
pgbuf_get_page_flush_interval (bool & is_timed_wait, cubthread::delta_time & period)
{
int page_flush_interval_msecs = prm_get_integer_value (PRM_ID_PAGE_BG_FLUSH_INTERVAL_MSECS);
assert (page_flush_interval_msecs >= 0);
if (page_flush_interval_msecs > 0)
{
// positive interval: loop with a fixed period
is_timed_wait = true;
period = std::chrono::milliseconds (page_flush_interval_msecs);
}
else
{
// infinite wait
is_timed_wait = false;
}
}
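/* This function is handed to the daemon looper as a period setup function (see pgbuf_page_flush_daemon_init
* below), so the flush interval is re-read from the system parameter before every wait:
*
*   cubthread::looper looper = cubthread::looper (pgbuf_get_page_flush_interval);
*/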
// *INDENT-OFF*
#if defined (SERVER_MODE)
static void
pgbuf_page_maintenance_execute (cubthread::entry & thread_ref)
{
if (!BO_IS_FLUSH_DAEMON_AVAILABLE ())
{
return;
}
/* page buffer maintenance thread adjust quota's based on thread activity. */
pgbuf_adjust_quotas (&thread_ref);
/* search lists and assign victims directly */
pgbuf_direct_victims_maintenance (&thread_ref);
}
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
// class pgbuf_page_flush_daemon_task
//
// description:
// page flush daemon task
//
class pgbuf_page_flush_daemon_task : public cubthread::entry_task
{
private:
PERF_UTIME_TRACKER m_perf_track;
public:
pgbuf_page_flush_daemon_task ()
{
PERF_UTIME_TRACKER_START (NULL, &m_perf_track);
}
void execute (cubthread::entry & thread_ref) override
{
if (!BO_IS_FLUSH_DAEMON_AVAILABLE ())
{
return;
}
// the daemon was explicitly woken up rather than timing out: someone requested a flush, so run at least once
bool force_one_run = pgbuf_Page_flush_daemon->was_woken_up ();
bool stop_iteration = false;
/* flush pages as long as necessary */
while (force_one_run || pgbuf_keep_victim_flush_thread_running ())
{
pgbuf_flush_victim_candidates (&thread_ref, prm_get_float_value (PRM_ID_PB_BUFFER_FLUSH_RATIO), &m_perf_track,
&stop_iteration);
force_one_run = false;
if (stop_iteration)
{
break;
}
}
/* performance tracking */
if (m_perf_track.is_perf_tracking)
{
/* register sleep time. */
PERF_UTIME_TRACKER_TIME_AND_RESTART (&thread_ref, &m_perf_track, PSTAT_PB_FLUSH_SLEEP);
/* update is_perf_tracking */
m_perf_track.is_perf_tracking = perfmon_is_perf_tracking ();
}
else
{
/* update is_perf_tracking and start timer if it became true */
PERF_UTIME_TRACKER_START (&thread_ref, &m_perf_track);
}
}
};
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
static void
pgbuf_page_post_flush_execute (cubthread::entry & thread_ref)
{
if (!BO_IS_FLUSH_DAEMON_AVAILABLE ())
{
return;
}
/* assign flushed pages */
if (pgbuf_assign_flushed_pages (&thread_ref))
{
/* reset daemon looper and be prepared to start over */
pgbuf_Page_post_flush_daemon->reset_looper ();
}
}
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
// class pgbuf_flush_control_daemon_task
//
// description:
// flush control daemon task
//
class pgbuf_flush_control_daemon_task : public cubthread::entry_task
{
private:
struct timeval m_end;
bool m_first_run;
public:
pgbuf_flush_control_daemon_task ()
: m_end ({0, 0})
, m_first_run (true)
{
}
int initialize ()
{
return fileio_flush_control_initialize ();
}
void execute (cubthread::entry & thread_ref) override
{
if (!BO_IS_FLUSH_DAEMON_AVAILABLE ())
{
return;
}
if (m_first_run)
{
gettimeofday (&m_end, NULL);
m_first_run = false;
return;
}
struct timeval begin, diff;
int token_gen, token_consumed;
gettimeofday (&begin, NULL);
perfmon_diff_timeval (&diff, &m_end, &begin);
int64_t diff_usec = diff.tv_sec * 1000000LL + diff.tv_usec;
fileio_flush_control_add_tokens (&thread_ref, diff_usec, &token_gen, &token_consumed);
gettimeofday (&m_end, NULL);
}
void retire (void) override
{
fileio_flush_control_finalize ();
delete this;
}
};
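/* A minimal sketch of the elapsed-time arithmetic in execute () above, assuming perfmon_diff_timeval () stores
* begin - m_end into diff:
*
*   struct timeval diff = { 1, 250000 };                          // 1.25 seconds since the last run
*   int64_t diff_usec = diff.tv_sec * 1000000LL + diff.tv_usec;   // == 1250000
*
* the 1000000LL literal forces the multiplication into 64 bits even where tv_sec is a 32-bit type. */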
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
/*
* pgbuf_page_maintenance_daemon_init () - initialize page maintenance daemon thread
*/
REGISTER_DAEMON (pgbuf_page_maintenance);
void
pgbuf_page_maintenance_daemon_init ()
{
assert (pgbuf_Page_maintenance_daemon == NULL);
cubthread::looper looper = cubthread::looper (std::chrono::milliseconds (100));
cubthread::entry_callable_task *daemon_task = new cubthread::entry_callable_task (pgbuf_page_maintenance_execute);
pgbuf_Page_maintenance_daemon = cubthread::get_manager ()->create_daemon (looper, daemon_task, "pgbuf-maintain");
}
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
/*
* pgbuf_page_flush_daemon_init () - initialize page flush daemon thread
*/
REGISTER_DAEMON (pgbuf_page_flush);
void
pgbuf_page_flush_daemon_init ()
{
assert (pgbuf_Page_flush_daemon == NULL);
cubthread::looper looper = cubthread::looper (pgbuf_get_page_flush_interval);
pgbuf_page_flush_daemon_task *daemon_task = new pgbuf_page_flush_daemon_task ();
pgbuf_Page_flush_daemon = cubthread::get_manager ()->create_daemon (looper, daemon_task, "pgbuf-page-flush");
}
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
/*
* pgbuf_page_post_flush_daemon_init () - initialize page post flush daemon thread
*/
REGISTER_DAEMON (pgbuf_page_post_flush);
void
pgbuf_page_post_flush_daemon_init ()
{
assert (pgbuf_Page_post_flush_daemon == NULL);
std::array<cubthread::delta_time, 3> looper_interval {{
std::chrono::milliseconds (1),
std::chrono::milliseconds (10),
std::chrono::milliseconds (100)
}};
cubthread::looper looper = cubthread::looper (looper_interval);
cubthread::entry_callable_task *daemon_task = new cubthread::entry_callable_task (pgbuf_page_post_flush_execute);
pgbuf_Page_post_flush_daemon = cubthread::get_manager ()->create_daemon (looper, daemon_task, "pgbuf-page-post-flush");
}
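/* The three periods presumably make the daemon back off while idle: the looper advances to the next (longer) wait
* when an iteration finds nothing to do, and pgbuf_page_post_flush_execute () calls reset_looper () to fall back to
* the 1 ms period as soon as flushed pages were assigned. */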
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
/*
* pgbuf_flush_control_daemon_init () - initialize flush control daemon thread
*/
REGISTER_DAEMON (pgbuf_flush_control);
void
pgbuf_flush_control_daemon_init ()
{
assert (pgbuf_Flush_control_daemon == NULL);
pgbuf_flush_control_daemon_task *daemon_task = new pgbuf_flush_control_daemon_task ();
if (daemon_task->initialize () != NO_ERROR)
{
delete daemon_task;
return;
}
cubthread::looper looper = cubthread::looper (std::chrono::milliseconds (50));
pgbuf_Flush_control_daemon = cubthread::get_manager ()->create_daemon (looper, daemon_task, "pgbuf-flush-control");
}
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
/*
* pgbuf_daemons_init () - initialize page buffer daemon threads
*/
void
pgbuf_daemons_init ()
{
pgbuf_page_maintenance_daemon_init ();
pgbuf_page_flush_daemon_init ();
pgbuf_page_post_flush_daemon_init ();
pgbuf_flush_control_daemon_init ();
}
#endif /* SERVER_MODE */
#if defined (SERVER_MODE)
/*
* pgbuf_daemons_destroy () - destroy page buffer daemon threads
*/
void
pgbuf_daemons_destroy ()
{
cubthread::get_manager ()->destroy_daemon (pgbuf_Page_maintenance_daemon);
cubthread::get_manager ()->destroy_daemon (pgbuf_Page_flush_daemon);
cubthread::get_manager ()->destroy_daemon (pgbuf_Page_post_flush_daemon);
cubthread::get_manager ()->destroy_daemon (pgbuf_Flush_control_daemon);
}
#endif /* SERVER_MODE */
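/*
* pgbuf_daemons_get_stats () - copy page buffer daemon statistics into the output buffer
*
* stats_out (out) : buffer with room for four blocks of cubthread::daemon::get_stats_value_count () values, one
*                   per daemon, in the order: flush, post-flush, flush-control, maintenance. the pointer advances
*                   past a block even when the corresponding daemon does not exist, so the layout is fixed.
*/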
void
pgbuf_daemons_get_stats (UINT64 * stats_out)
{
#if defined (SERVER_MODE)
UINT64 *statsp = stats_out;
if (pgbuf_Page_flush_daemon != NULL)
{
pgbuf_Page_flush_daemon->get_stats (statsp);
}
statsp += cubthread::daemon::get_stats_value_count ();
if (pgbuf_Page_post_flush_daemon != NULL)
{
pgbuf_Page_post_flush_daemon->get_stats (statsp);
}
statsp += cubthread::daemon::get_stats_value_count ();
if (pgbuf_Flush_control_daemon != NULL)
{
pgbuf_Flush_control_daemon->get_stats (statsp);
}
statsp += cubthread::daemon::get_stats_value_count ();
if (pgbuf_Page_maintenance_daemon != NULL)
{
pgbuf_Page_maintenance_daemon->get_stats (statsp);
}
#endif
}
// *INDENT-ON*
/*
* pgbuf_is_page_flush_daemon_available () - check if page flush daemon is available
* return: true if page flush daemon is available, false otherwise
*/
static bool
pgbuf_is_page_flush_daemon_available ()
{
#if defined (SERVER_MODE)
return pgbuf_Page_flush_daemon != NULL;
#else
return false;
#endif
}
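/*
* pgbuf_is_temp_lsa () - true if the lsa equals the fixed PGBUF_TEMP_LSA marker stored in temporary pages
*/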
static bool
pgbuf_is_temp_lsa (const log_lsa & lsa)
{
return lsa == PGBUF_TEMP_LSA;
}
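/*
* pgbuf_init_temp_page_lsa () - mark an io page as temporary by storing PGBUF_TEMP_LSA both in the page header and
*                               in the watermark copy at the end of the page
*/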
static void
pgbuf_init_temp_page_lsa (FILEIO_PAGE * io_page, PGLENGTH page_size)
{
io_page->prv.lsa = PGBUF_TEMP_LSA;
FILEIO_PAGE_WATERMARK *prv2 = fileio_get_page_watermark_pos (io_page, page_size);
prv2->lsa = PGBUF_TEMP_LSA;
}
/*
* pgbuf_scan_bcb_table () - scan the bcb table and collect snapshot counters; no bcb mutex is taken, so the counters are approximate
*/
static void
pgbuf_scan_bcb_table ()
{
int bufid;
int flags;
PGBUF_BCB *bufptr;
PAGE_TYPE page_type;
VPID vpid;
PGBUF_STATUS_SNAPSHOT *show_status_snapshot = &pgbuf_Pool.show_status_snapshot;
memset (show_status_snapshot, 0, sizeof (PGBUF_STATUS_SNAPSHOT));
for (bufid = 0; bufid < pgbuf_Pool.num_buffers; bufid++)
{
bufptr = PGBUF_FIND_BCB_PTR (bufid);
page_type = (PAGE_TYPE) (bufptr->iopage_buffer->iopage.prv.ptype);
vpid = bufptr->vpid;
flags = bufptr->flags;
if ((flags & PGBUF_BCB_DIRTY_FLAG) != 0)
{
show_status_snapshot->dirty_pages++;
}
else
{
show_status_snapshot->clean_pages++;
}
if ((flags & PGBUF_INVALID_ZONE) != 0)
{
show_status_snapshot->free_pages++;
continue;
}
if ((PGBUF_GET_ZONE (flags) == PGBUF_LRU_3_ZONE) && (flags & PGBUF_BCB_DIRTY_FLAG) != 0)
{
show_status_snapshot->victim_candidate_pages++;
}
/* count temporary and permanent pages */
if (pgbuf_is_temporary_volume (vpid.volid) == true)
{
show_status_snapshot->num_temp_pages++;
assert ((page_type == PAGE_UNKNOWN) ||	/* deallocated pages; page type is unknown */
(page_type == PAGE_AREA) || (page_type == PAGE_QRESULT) ||	/* temporary page types */
(page_type == PAGE_EHASH) || (page_type == PAGE_VOLHEADER)	/* can be temporary or permanent */
|| (page_type == PAGE_VOLBITMAP) || (page_type == PAGE_FTAB));	/* can be temporary or permanent */
}
else
{
switch (page_type)
{
case PAGE_BTREE:
show_status_snapshot->num_index_pages++;
break;
case PAGE_OVERFLOW:
case PAGE_HEAP:
show_status_snapshot->num_data_pages++;
break;
case PAGE_CATALOG:
case PAGE_VOLBITMAP:
case PAGE_VOLHEADER:
case PAGE_FTAB:
case PAGE_EHASH:
case PAGE_VACUUM_DATA:
case PAGE_DROPPED_FILES:
show_status_snapshot->num_system_pages++;
break;
default:
/* deallocated pages; page type is unknown */
assert (page_type == PAGE_UNKNOWN);
break;
}
}
}
}
/*
* pgbuf_start_scan () - start scan function for show page buffer status
* return: NO_ERROR, or ER_code
*
* thread_p(in):
* type (in):
* arg_values(in):
* arg_cnt(in):
* ptr(in/out):
*/
int
pgbuf_start_scan (THREAD_ENTRY * thread_p, int type, DB_VALUE ** arg_values, int arg_cnt, void **ptr)
{
SHOWSTMT_ARRAY_CONTEXT *ctx = NULL;
const int num_cols = 19;
time_t cur_time;
int idx, i;
int error = NO_ERROR;
DB_VALUE *vals = NULL, db_val;
unsigned long long delta, hit_delta, request_delta;
double time_delta;
double hit_rate;
DB_DATA_STATUS data_status;
PGBUF_STATUS status_accumulated = { };
PGBUF_STATUS_SNAPSHOT *status_snapshot = &pgbuf_Pool.show_status_snapshot;
PGBUF_STATUS_OLD *status_old = &pgbuf_Pool.show_status_old;
*ptr = NULL;
#if defined(SERVER_MODE)
(void) pthread_mutex_lock (&pgbuf_Pool.show_status_mutex);
#endif
pgbuf_scan_bcb_table ();
for (i = 0; i <= MAX_NTRANS; i++)
{
status_accumulated.num_hit += pgbuf_Pool.show_status[i].num_hit;
status_accumulated.num_page_request += pgbuf_Pool.show_status[i].num_page_request;
status_accumulated.num_pages_created += pgbuf_Pool.show_status[i].num_pages_created;
status_accumulated.num_pages_written += pgbuf_Pool.show_status[i].num_pages_written;
status_accumulated.num_pages_read += pgbuf_Pool.show_status[i].num_pages_read;
status_accumulated.num_flusher_waiting_threads += pgbuf_Pool.show_status[i].num_flusher_waiting_threads;
}
ctx = showstmt_alloc_array_context (thread_p, 1, num_cols);
if (ctx == NULL)
{
error = er_errid ();
goto exit_on_error;
}
vals = showstmt_alloc_tuple_in_context (thread_p, ctx);
if (vals == NULL)
{
error = er_errid ();
goto exit_on_error;
}
cur_time = time (NULL);
time_delta = difftime (cur_time, status_old->print_out_time) + 0.0001; // avoid dividing by 0
idx = 0;
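/* hit rate over the interval since the previous scan: (delta num_hit / delta num_page_request) * 100; the tiny
* epsilon keeps the division defined when no page was requested. */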
hit_rate = (status_accumulated.num_hit - status_old->num_hit) /
((status_accumulated.num_page_request - status_old->num_page_request) + 0.0000000000001);
hit_rate = hit_rate * 100;
db_make_double (&db_val, hit_rate);
db_value_domain_init (&vals[idx], DB_TYPE_NUMERIC, 13, 10);
error = numeric_db_value_coerce_to_num (&db_val, &vals[idx], &data_status);
idx++;
if (error != NO_ERROR)
{
goto exit_on_error;
}
delta = status_accumulated.num_hit - status_old->num_hit;
db_make_bigint (&vals[idx], delta);
idx++;
delta = status_accumulated.num_page_request - status_old->num_page_request;
db_make_bigint (&vals[idx], delta);
idx++;
db_make_int (&vals[idx], pgbuf_Pool.num_buffers);
idx++;
db_make_int (&vals[idx], PGBUF_IOPAGE_BUFFER_SIZE);
idx++;
db_make_int (&vals[idx], status_snapshot->free_pages);
idx++;
db_make_int (&vals[idx], status_snapshot->victim_candidate_pages);
idx++;
db_make_int (&vals[idx], status_snapshot->clean_pages);
idx++;
db_make_int (&vals[idx], status_snapshot->dirty_pages);
idx++;
db_make_int (&vals[idx], status_snapshot->num_index_pages);
idx++;
db_make_int (&vals[idx], status_snapshot->num_data_pages);
idx++;
db_make_int (&vals[idx], status_snapshot->num_system_pages);
idx++;
db_make_int (&vals[idx], status_snapshot->num_temp_pages);
idx++;
delta = status_accumulated.num_pages_created - status_old->num_pages_created;
db_make_bigint (&vals[idx], delta);
idx++;
delta = status_accumulated.num_pages_written - status_old->num_pages_written;
db_make_bigint (&vals[idx], delta);
idx++;
db_make_double (&db_val, delta / time_delta);
db_value_domain_init (&vals[idx], DB_TYPE_NUMERIC, 20, 10);
error = numeric_db_value_coerce_to_num (&db_val, &vals[idx], &data_status);
idx++;
if (error != NO_ERROR)
{
goto exit_on_error;
}
delta = status_accumulated.num_pages_read - status_old->num_pages_read;
db_make_bigint (&vals[idx], delta);
idx++;
db_make_double (&db_val, delta / time_delta);
db_value_domain_init (&vals[idx], DB_TYPE_NUMERIC, 20, 10);
error = numeric_db_value_coerce_to_num (&db_val, &vals[idx], &data_status);
idx++;
if (error != NO_ERROR)
{
goto exit_on_error;
}
db_make_int (&vals[idx], status_accumulated.num_flusher_waiting_threads);
idx++;
assert (idx == num_cols);
/* save the current counters as the baseline for the next scan */
status_old->num_hit = status_accumulated.num_hit;
status_old->num_page_request = status_accumulated.num_page_request;
status_old->num_pages_created = status_accumulated.num_pages_created;
status_old->num_pages_written = status_accumulated.num_pages_written;
status_old->num_pages_read = status_accumulated.num_pages_read;
status_old->print_out_time = cur_time;
*ptr = ctx;
#if defined(SERVER_MODE)
pthread_mutex_unlock (&pgbuf_Pool.show_status_mutex);
#endif
return NO_ERROR;
exit_on_error:
if (ctx != NULL)
{
showstmt_free_array_context (thread_p, ctx);
}
#if defined(SERVER_MODE)
pthread_mutex_unlock (&pgbuf_Pool.show_status_mutex);
#endif
return error;
}