/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2001-2003
 *	Sleepycat Software.  All rights reserved.
 */

/*
 * Replication declarations: message type codes, the shared replication
 * structure (REP), the per-process handle (struct __db_rep), the message
 * control header (REP_CONTROL), election vote records, and the helper
 * structures used when collecting a transaction's log records and locks.
 *
 * NOTE(review): this header relies on names it does not declare itself
 * (DB_MUTEX, roff_t, u_int32_t, DB_LSN, DB_REP_STAT, DB, DBT, DB_LOCKREQ,
 * DB_LOCK_ILOCK, DB_REP, F_ISSET, REP_ON, IS_RECOVERING, time_t); they
 * must already be visible at the point of inclusion.
 */
#ifndef _REP_H_
#define _REP_H_

/*
 * Replication message types.
 *
 * NOTE(review): presumably these are the values carried in the rectype
 * field of REP_CONTROL (declared below) -- confirm against the senders.
 */
#define REP_ALIVE 1 /* I am alive message. */
#define REP_ALIVE_REQ 2 /* Request for alive messages. */
#define REP_ALL_REQ 3 /* Request all log records greater than LSN. */
#define REP_DUPMASTER 4 /* Duplicate master detected; propagate. */
#define REP_FILE 5 /* Page of a database file. */
#define REP_FILE_REQ 6 /* Request for a database file. */
#define REP_LOG 7 /* Log record. */
#define REP_LOG_MORE 8 /* There are more log records to request. */
#define REP_LOG_REQ 9 /* Request for a log record. */
#define REP_MASTER_REQ 10 /* Who is the master */
#define REP_NEWCLIENT 11 /* Announces the presence of a new client. */
#define REP_NEWFILE 12 /* Announce a log file change. */
#define REP_NEWMASTER 13 /* Announces who the master is. */
#define REP_NEWSITE 14 /* Announces that a site has heard from a new
				 * site; like NEWCLIENT, but indirect.  A
				 * NEWCLIENT message comes directly from the new
				 * client while a NEWSITE comes indirectly from
				 * someone who heard about a NEWSITE.
				 */
#define REP_PAGE 15 /* Database page. */
#define REP_PAGE_REQ 16 /* Request for a database page. */
#define REP_PLIST 17 /* Database page list. */
#define REP_PLIST_REQ 18 /* Request for a page list. */
#define REP_VERIFY 19 /* A log record for verification. */
#define REP_VERIFY_FAIL 20 /* The client is outdated. */
#define REP_VERIFY_REQ 21 /* Request for a log record to verify. */
#define REP_VOTE1 22 /* Send out your information for an election. */
#define REP_VOTE2 23 /* Send a "you are master" vote. */

/* Shared replication structure. */
typedef struct __rep {
	/*
	 * Due to alignment constraints on some architectures (e.g. HP-UX),
	 * DB_MUTEXes must be the first element of shalloced structures,
	 * and as a corollary there can be only one per structure.  Thus,
	 * db_mutex_off points to a mutex in a separately-allocated chunk.
	 */
	DB_MUTEX mutex; /* Region lock. */
	roff_t db_mutex_off; /* Client database mutex. */
	u_int32_t tally_off; /* Offset of the tally region. */
	u_int32_t v2tally_off; /* Offset of the vote2 tally region. */
	int eid; /* Environment id. */
	int master_id; /* ID of the master site. */
	u_int32_t egen; /* Replication election generation. */
	u_int32_t gen; /* Replication generation number. */
	u_int32_t recover_gen; /* Last generation number in log. */
	int asites; /* Space allocated for sites. */
	int nsites; /* Number of sites in group. */
	int priority; /* My priority in an election. */
	u_int32_t gbytes; /* Limit on data sent in single... */
	u_int32_t bytes; /* __rep_process_message call. */

	/*
	 * NOTE(review): the two constants below look like the default values
	 * for request_gap and max_gap respectively -- confirm where the
	 * region is initialized.
	 */
#define DB_REP_REQUEST_GAP 4
#define DB_REP_MAX_GAP 128
	u_int32_t request_gap; /* # of records to receive before we
					 * request a missing log record. */
	u_int32_t max_gap; /* Maximum number of records before
					 * requesting a missing log record. */

	/* Status change information */
	u_int32_t msg_th; /* Number of callers in rep_proc_msg. */
	int start_th; /* A thread is in rep_start. */
	u_int32_t handle_cnt; /* Count of handles in library. */
	u_int32_t op_cnt; /* Multi-step operation count.*/
	int in_recovery; /* Running recovery now. */
	time_t timestamp; /* Recovery timestamp. */

	/* Vote tallying information. */
	int sites; /* Sites heard from. */
	int winner; /* Current winner. */
	int w_priority; /* Winner priority. */
	u_int32_t w_gen; /* Winner generation. */
	DB_LSN w_lsn; /* Winner LSN. */
	int w_tiebreaker; /* Winner tiebreaking value. */
	int votes; /* Number of votes for this site. */

	/* Statistics. */
	DB_REP_STAT stat;

	/*
	 * Bit values for the flags field that follows.  Each REP_F_* value
	 * is a distinct bit; REP_ISCLIENT is a mask covering both client
	 * kinds (upgradeable and logs-only).
	 */
#define REP_F_EPHASE1 0x001 /* In phase 1 of election. */
#define REP_F_EPHASE2 0x002 /* In phase 2 of election. */
#define REP_F_LOGSONLY 0x004 /* Log only; can't upgrade. */
#define REP_F_MASTER 0x008 /* Master replica. */
#define REP_F_MASTERELECT 0x010 /* Master elect */
#define REP_F_NOARCHIVE 0x020 /* Rep blocks log_archive */
#define REP_F_READY 0x040 /* Wait for txn_cnt to be 0. */
#define REP_F_RECOVER 0x080 /* In recovery. */
#define REP_F_TALLY 0x100 /* Tallied vote before elect. */
#define REP_F_UPGRADE 0x200 /* Upgradeable replica. */
#define REP_ISCLIENT (REP_F_UPGRADE | REP_F_LOGSONLY)
	u_int32_t flags;
} REP;

/*
 * Election-state tests on a REP pointer R.  Both are thin wrappers around
 * F_ISSET (defined elsewhere) with a mask of election flags: IN_ELECTION
 * checks the two phase flags, IN_ELECTION_TALLY additionally includes
 * REP_F_TALLY.
 */
#define IN_ELECTION(R) F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2)
#define IN_ELECTION_TALLY(R) \
	F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY)

/*
 * Role tests on an environment handle.  Each one requires REP_ON(dbenv),
 * a non-NULL region pointer in the environment's rep_handle, and then
 * tests the region with F_ISSET against the role's flag mask.
 *
 * Note that dbenv is expanded several times in each macro, so the argument
 * must not have side effects.
 */
#define IS_REP_MASTER(dbenv) \
	(REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region && \
	F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \
	REP_F_MASTER))

#define IS_REP_CLIENT(dbenv) \
	(REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region && \
	F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \
	REP_ISCLIENT))

#define IS_REP_LOGSONLY(dbenv) \
	(REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region && \
	F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \
	REP_F_LOGSONLY))

/*
 * Macros to figure out if we need to do replication pre/post-amble
 * processing.
 *
 * IS_REPLICATED(E, D) excludes handles D on which F_ISSET reports
 * DB_AM_RECOVER or DB_AM_REPLICATION; IS_ENV_REPLICATED(E) excludes
 * environments for which IS_RECOVERING(E) holds.  Both then require
 * REP_ON(E), a non-NULL region, and a non-zero region flags word.
 * E is expanded more than once, so it must not have side effects.
 */
#define IS_REPLICATED(E, D) \
	(!F_ISSET((D), DB_AM_RECOVER | DB_AM_REPLICATION) && \
	REP_ON(E) && ((DB_REP *)((E)->rep_handle))->region != NULL && \
	((DB_REP *)((E)->rep_handle))->region->flags != 0)

#define IS_ENV_REPLICATED(E) (!IS_RECOVERING(E) && REP_ON(E) && \
	((DB_REP *)((E)->rep_handle))->region != NULL && \
	((DB_REP *)((E)->rep_handle))->region->flags != 0)

/*
 * Per-process replication structure.
 *
 * There are 2 mutexes used in replication.
 * 1.  rep_mutexp - This protects the fields of the rep region above.
 * 2.  db_mutexp - This protects the bookkeeping database and all
 * of the components that maintain it.  Those components include
 * the following fields in the log region (see log.h):
 *	a. ready_lsn
 *	b. waiting_lsn
 *	c. verify_lsn
 *	d. wait_recs
 *	e. rcvd_recs
 *	f. max_wait_lsn
 * These fields in the log region are NOT protected by the log
 * region lock at all.
 *
 * The lock ordering protocol is that db_mutexp must be acquired
 * first and then either rep_mutexp, or the log region mutex may
 * be acquired if necessary.
 */
struct __db_rep {
	DB_MUTEX *rep_mutexp; /* Mutex for rep region */
	DB_MUTEX *db_mutexp; /* Mutex for bookkeeping database. */
	DB *rep_db; /* Bookkeeping database. */
	REP *region; /* In memory structure. */
};

/*
 * Control structure for replication communication infrastructure.
 *
 * Note that the version information should be at the beginning of the
 * structure, so that we can rearrange the rest of it while letting the
 * version checks continue to work.  DB_REPVERSION should be revved any time
 * the rest of the structure changes.
 */
typedef struct __rep_control {
#define DB_REPVERSION 1
	u_int32_t rep_version; /* Replication version number. */
	u_int32_t log_version; /* Log version number. */

	DB_LSN lsn; /* Log sequence number. */
	u_int32_t rectype; /* Message type. */
	u_int32_t gen; /* Generation number. */
	u_int32_t flags; /* log_put flag value. */
} REP_CONTROL;

/* Election vote information. */
typedef struct __rep_vote {
	u_int32_t egen; /* Election generation. */
	int nsites; /* Number of sites I've been in
					 * communication with. */
	int priority; /* My site's priority. */
	int tiebreaker; /* Tie-breaking quasi-random int. */
} REP_VOTE_INFO;

/* One tallied vote: which voter, and in which election generation. */
typedef struct __rep_vtally {
	u_int32_t egen; /* Voter's election generation. */
	int eid; /* Voter's ID. */
} REP_VTALLY;

/*
 * This structure takes care of representing a transaction.
 * It holds all the records, sorted by page number so that
 * we can obtain locks and apply updates in a deadlock free
 * order.
 */
typedef struct __lsn_page {
	DB_LSN lsn;
	int32_t fid;
	DB_LOCK_ILOCK pgdesc;
#define LSN_PAGE_NOLOCK 0x0001 /* No lock necessary for log rec. */
	u_int32_t flags;
} LSN_PAGE;

/*
 * Growable array of LSN_PAGE entries for one transaction.
 *
 * NOTE(review): npages/nalloc are presumably the used and allocated
 * element counts of array -- confirm against the code that fills it.
 */
typedef struct __txn_recs {
	int npages;
	int nalloc;
	LSN_PAGE *array;
	u_int32_t txnid;
	u_int32_t lockid;
} TXN_RECS;

/*
 * Growable array of DB_LSN values.
 *
 * NOTE(review): nlsns/nalloc are presumably the used and allocated
 * element counts of array -- confirm against the code that fills it.
 */
typedef struct __lsn_collection {
	int nlsns;
	int nalloc;
	DB_LSN *array;
} LSN_COLLECTION;

/*
 * This is used by the page-prep routines to do the lock_vec call to
 * apply the updates for a single transaction or a collection of
 * transactions.
 *
 * NOTE(review): n is presumably the number of entries in reqs and objs --
 * confirm against the page-prep routines.
 */
typedef struct _linfo {
	int n;
	DB_LOCKREQ *reqs;
	DBT *objs;
} linfo_t;

#include "dbinc_auto/rep_ext.h"
#endif /* !_REP_H_ */