Files
cpython-source-deps/env/env_region.c
2017-09-04 13:40:25 -05:00

1376 lines
37 KiB
C

/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996,2008 Oracle. All rights reserved.
*
* $Id: env_region.c 63573 2008-05-23 21:43:21Z trent.nelson $
*/
#include "db_config.h"
#include "db_int.h"
#include "dbinc/mp.h"
static int __env_des_get __P((ENV *, REGINFO *, REGINFO *, REGION **));
static int __env_faultmem __P((ENV *, void *, size_t, int));
static int __env_sys_attach __P((ENV *, REGINFO *, REGION *));
static int __env_sys_detach __P((ENV *, REGINFO *, int));
static void __env_des_destroy __P((ENV *, REGION *));
static void __env_remove_file __P((ENV *));
/*
* __env_attach
* Join/create the environment
*
* PUBLIC: int __env_attach __P((ENV *, u_int32_t *, int, int));
*/
int
__env_attach(env, init_flagsp, create_ok, retry_ok)
ENV *env;
u_int32_t *init_flagsp;
int create_ok, retry_ok;
{
DB_ENV *dbenv;
REGENV *renv;
REGENV_REF ref;
REGINFO *infop;
REGION *rp, tregion;
size_t nrw, size;
u_int32_t bytes, i, mbytes, nregions, signature;
u_int retry_cnt;
int majver, minver, patchver, ret, segid;
char buf[sizeof(DB_REGION_FMT) + 20];
/* Initialization */
dbenv = env->dbenv;
retry_cnt = 0;
signature = __env_struct_sig();
/* Repeated initialization. */
loop: renv = NULL;
/* Set up the ENV's REG_INFO structure. */
if ((ret = __os_calloc(env, 1, sizeof(REGINFO), &infop)) != 0)
return (ret);
infop->env = env;
infop->type = REGION_TYPE_ENV;
infop->id = REGION_ID_ENV;
infop->flags = REGION_JOIN_OK;
if (create_ok)
F_SET(infop, REGION_CREATE_OK);
/* Build the region name. */
if (F_ISSET(env, ENV_PRIVATE))
ret = __os_strdup(env, "process-private", &infop->name);
else {
(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
ret =
__db_appname(env, DB_APP_NONE, buf, 0, NULL, &infop->name);
}
if (ret != 0)
goto err;
/*
* We have to single-thread the creation of the REGENV region. Once
* it exists, we can serialize using region mutexes, but until then
* we have to be the only player in the game.
*
* If this is a private environment, we are only called once and there
* are no possible race conditions.
*
* If this is a public environment, we use the filesystem to ensure
* the creation of the environment file is single-threaded.
*
* If the application has specified their own mapping functions, try
* and create the region. The application will have to let us know if
* it's actually a creation or not, and we'll have to fall-back to a
* join if it's not a create.
*/
if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL)
goto creation;
/*
* Try to create the file, if we have the authority. We have to ensure
* that multiple threads/processes attempting to simultaneously create
* the file are properly ordered. Open using the O_CREAT and O_EXCL
* flags so that multiple attempts to create the region will return
* failure in all but one. POSIX 1003.1 requires that EEXIST be the
* errno return value -- I sure hope they're right.
*/
if (create_ok) {
if ((ret = __os_open(env, infop->name, 0,
DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_REGION,
env->db_mode, &env->lockfhp)) == 0)
goto creation;
if (ret != EEXIST) {
__db_err(env, ret, "%s", infop->name);
goto err;
}
}
/* The region must exist, it's not okay to recreate it. */
F_CLR(infop, REGION_CREATE_OK);
/*
* If we couldn't create the file, try and open it. (If that fails,
* we're done.)
*/
if ((ret = __os_open(
env, infop->name, 0, DB_OSO_REGION, 0, &env->lockfhp)) != 0)
goto err;
/*
* !!!
* The region may be in system memory not backed by the filesystem
* (more specifically, not backed by this file), and we're joining
* it. In that case, the process that created it will have written
* out a REGENV_REF structure as its only contents. We read that
* structure before we do anything further, e.g., we can't just map
* that file in and then figure out what's going on.
*
* All of this noise is because some systems don't have a coherent VM
* and buffer cache, and what's worse, when you mix operations on the
* VM and buffer cache, half the time you hang the system.
*
* If the file is the size of an REGENV_REF structure, then we know
* the real region is in some other memory. (The only way you get a
* file that size is to deliberately write it, as it's smaller than
* any possible disk sector created by writing a file or mapping the
* file into memory.) In which case, retrieve the structure from the
* file and use it to acquire the referenced memory.
*
* If the structure is larger than a REGENV_REF structure, then this
* file is backing the shared memory region, and we just map it into
* memory.
*
* And yes, this makes me want to take somebody and kill them. (I
* digress -- but you have no freakin' idea. This is unbelievably
* stupid and gross, and I've probably spent six months of my life,
* now, trying to make different versions of it work.)
*/
if ((ret = __os_ioinfo(env, infop->name,
env->lockfhp, &mbytes, &bytes, NULL)) != 0) {
__db_err(env, ret, "%s", infop->name);
goto err;
}
/*
* !!!
* A size_t is OK -- regions get mapped into memory, and so can't
* be larger than a size_t.
*/
size = mbytes * MEGABYTE + bytes;
/*
* If the size is less than the size of a REGENV_REF structure, the
* region (or, possibly, the REGENV_REF structure) has not yet been
* completely written. Shouldn't be possible, but there's no reason
* not to wait awhile and try again.
*
* Otherwise, if the size is the size of a REGENV_REF structure,
* read it into memory and use it as a reference to the real region.
*/
if (size <= sizeof(ref)) {
if (size != sizeof(ref))
goto retry;
if ((ret = __os_read(env, env->lockfhp, &ref,
sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) {
if (ret == 0)
ret = EIO;
__db_err(env, ret,
"%s: unable to read system-memory information",
infop->name);
goto err;
}
size = ref.size;
segid = ref.segid;
F_SET(env, ENV_SYSTEM_MEM);
} else if (F_ISSET(env, ENV_SYSTEM_MEM)) {
ret = EINVAL;
__db_err(env, ret,
"%s: existing environment not created in system memory",
infop->name);
goto err;
} else
segid = INVALID_REGION_SEGID;
#ifndef HAVE_MUTEX_FCNTL
/*
* If we're not doing fcntl locking, we can close the file handle. We
* no longer need it and the less contact between the buffer cache and
* the VM, the better.
*/
(void)__os_closehandle(env, env->lockfhp);
env->lockfhp = NULL;
#endif
/* Call the region join routine to acquire the region. */
memset(&tregion, 0, sizeof(tregion));
tregion.size = (roff_t)size;
tregion.segid = segid;
if ((ret = __env_sys_attach(env, infop, &tregion)) != 0)
goto err;
user_map_functions:
/*
* The environment's REGENV structure has to live at offset 0 instead
* of the usual alloc information. Set the primary reference and
* correct the "addr" value to reference the alloc region. Note,
* this means that all of our offsets (R_ADDR/R_OFFSET) get shifted
* as well, but that should be fine.
*/
infop->primary = infop->addr;
infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
renv = infop->primary;
/*
* Make sure the region matches our build. Special case a region
* that's all nul bytes, just treat it like any other corruption.
*/
if (renv->majver != DB_VERSION_MAJOR ||
renv->minver != DB_VERSION_MINOR) {
if (renv->majver != 0 || renv->minver != 0) {
__db_errx(env,
"Program version %d.%d doesn't match environment version %d.%d",
DB_VERSION_MAJOR, DB_VERSION_MINOR,
renv->majver, renv->minver);
ret = DB_VERSION_MISMATCH;
} else
ret = EINVAL;
goto err;
}
if (renv->signature != signature) {
__db_errx(env, "Build signature doesn't match environment");
ret = DB_VERSION_MISMATCH;
goto err;
}
/*
* Check if the environment has had a catastrophic failure.
*
* Check the magic number to ensure the region is initialized. If the
* magic number isn't set, the lock may not have been initialized, and
* an attempt to use it could lead to random behavior.
*
* The panic and magic values aren't protected by any lock, so we never
* use them in any check that's more complex than set/not-set.
*
* !!!
* I'd rather play permissions games using the underlying file, but I
* can't because Windows/NT filesystems won't open files mode 0.
*/
if (renv->panic && !F_ISSET(dbenv, DB_ENV_NOPANIC)) {
ret = __env_panic_msg(env);
goto err;
}
if (renv->magic != DB_REGION_MAGIC)
goto retry;
/*
* Get a reference to the underlying REGION information for this
* environment.
*/
if ((ret = __env_des_get(env, infop, infop, &rp)) != 0 || rp == NULL)
goto find_err;
infop->rp = rp;
/*
* There's still a possibility for inconsistent data. When we acquired
* the size of the region and attached to it, it might have still been
* growing as part of its creation. We can detect this by checking the
* size we originally found against the region's current size. (The
* region's current size has to be final, the creator finished growing
* it before setting the magic number in the region.)
*
* !!!
* Skip this test when the application specified its own map functions.
* The size of the region is essentially unknown in that case: some
* other process asked the application's map function for some bytes,
* but we were never told the final size of the region. We could get
* a size back from the map function, but for all we know, our process'
* map function only knows how to join regions, it has no clue how big
* those regions are.
*/
if (DB_GLOBAL(j_region_map) == NULL && rp->size != size)
goto retry;
/*
* Check our callers configuration flags, it's an error to configure
* incompatible or additional subsystems in an existing environment.
* Return the total set of flags to the caller so they initialize the
* correct set of subsystems.
*/
if (init_flagsp != NULL) {
FLD_CLR(*init_flagsp, renv->init_flags);
if (*init_flagsp != 0) {
__db_errx(env,
"configured environment flags incompatible with existing environment");
ret = EINVAL;
goto err;
}
*init_flagsp = renv->init_flags;
}
/*
* Fault the pages into memory. Note, do this AFTER releasing the
* lock, because we're only reading the pages, not writing them.
*/
(void)__env_faultmem(env, infop->primary, rp->size, 0);
/* Everything looks good, we're done. */
env->reginfo = infop;
return (0);
creation:
/* Create the environment region. */
F_SET(infop, REGION_CREATE);
/*
* Allocate room for REGION structures plus overhead.
*
* XXX
* Overhead is so high because encryption passwds, replication vote
* arrays and the thread control block table are all stored in the
* base environment region. This is a bug, at the least replication
* should have its own region.
*
* Allocate space for thread info blocks. Max is only advisory,
* so we allocate 25% more.
*/
memset(&tregion, 0, sizeof(tregion));
nregions = __memp_max_regions(env) + 10;
size = nregions * sizeof(REGION);
size += dbenv->passwd_len;
size += (dbenv->thr_max + dbenv->thr_max / 4) *
__env_alloc_size(sizeof(DB_THREAD_INFO));
size += env->thr_nbucket * __env_alloc_size(sizeof(DB_HASHTAB));
size += 16 * 1024;
tregion.size = size;
tregion.segid = INVALID_REGION_SEGID;
if ((ret = __env_sys_attach(env, infop, &tregion)) != 0)
goto err;
/*
* If the application has specified its own mapping functions, we don't
* know until we get here if we are creating the region or not. The
* way we find out is underlying functions clear the REGION_CREATE flag.
*/
if (!F_ISSET(infop, REGION_CREATE))
goto user_map_functions;
/*
* Fault the pages into memory. Note, do this BEFORE we initialize
* anything, because we're writing the pages, not just reading them.
*/
(void)__env_faultmem(env, infop->addr, tregion.size, 1);
/*
* The first object in the region is the REGENV structure. This is
* different from the other regions, and, from everything else in
* this region, where all objects are allocated from the pool, i.e.,
* there aren't any fixed locations. The remaining space is made
* available for later allocation.
*
* The allocation space must be size_t aligned, because that's what
* the initialization routine is going to store there. To make sure
* that happens, the REGENV structure was padded with a final size_t.
* No other region needs to worry about it because all of them treat
* the entire region as allocation space.
*
* Set the primary reference and correct the "addr" value to reference
* the alloc region. Note, this requires that we "uncorrect" it at
* region detach, and that all of our offsets (R_ADDR/R_OFFSET) will be
* shifted as well, but that should be fine.
*/
infop->primary = infop->addr;
infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
__env_alloc_init(infop, tregion.size - sizeof(REGENV));
/*
* Initialize the rest of the REGENV structure. (Don't set the magic
* number to the correct value, that would validate the environment).
*/
renv = infop->primary;
renv->magic = 0;
renv->panic = 0;
(void)db_version(&majver, &minver, &patchver);
renv->majver = (u_int32_t)majver;
renv->minver = (u_int32_t)minver;
renv->patchver = (u_int32_t)patchver;
renv->signature = signature;
(void)time(&renv->timestamp);
__os_unique_id(env, &renv->envid);
/*
* Initialize init_flags to store the flags that any other environment
* handle that uses DB_JOINENV to join this environment will need.
*/
renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp;
/*
* Set up the region array. We use an array rather than a linked list
* as we have to traverse this list after failure in some cases, and
* we don't want to infinitely loop should the application fail while
* we're manipulating the list.
*/
renv->region_cnt = nregions;
if ((ret = __env_alloc(infop, nregions * sizeof(REGION), &rp)) != 0) {
__db_err(
env, ret, "unable to create new master region array");
goto err;
}
renv->region_off = R_OFFSET(infop, rp);
for (i = 0; i < nregions; ++i, ++rp)
rp->id = INVALID_REGION_ID;
renv->cipher_off = renv->thread_off = renv->rep_off = INVALID_ROFF;
renv->flags = 0;
renv->op_timestamp = renv->rep_timestamp = 0;
renv->mtx_regenv = MUTEX_INVALID;
/*
* Get the underlying REGION structure for this environment. Note,
* we created the underlying OS region before we acquired the REGION
* structure, which is backwards from the normal procedure. Update
* the REGION structure.
*/
if ((ret = __env_des_get(env, infop, infop, &rp)) != 0) {
find_err: __db_errx(env, "%s: unable to find environment", infop->name);
if (ret == 0)
ret = EINVAL;
goto err;
}
infop->rp = rp;
rp->size = tregion.size;
rp->segid = tregion.segid;
/*
* !!!
* If we create an environment where regions are public and in system
* memory, we have to inform processes joining the environment how to
* attach to the shared memory segment. So, we write the shared memory
* identifier into the file, to be read by those other processes.
*
* XXX
* This is really OS-layer information, but I can't see any easy way
* to move it down there without passing down information that it has
* no right to know, e.g., that this is the one-and-only REGENV region
* and not some other random region.
*/
if (tregion.segid != INVALID_REGION_SEGID) {
ref.size = tregion.size;
ref.segid = tregion.segid;
if ((ret = __os_write(
env, env->lockfhp, &ref, sizeof(ref), &nrw)) != 0) {
__db_err(env, ret,
"%s: unable to write out public environment ID",
infop->name);
goto err;
}
}
#ifndef HAVE_MUTEX_FCNTL
/*
* If we're not doing fcntl locking, we can close the file handle. We
* no longer need it and the less contact between the buffer cache and
* the VM, the better.
*/
if (env->lockfhp != NULL) {
(void)__os_closehandle(env, env->lockfhp);
env->lockfhp = NULL;
}
#endif
/* Everything looks good, we're done. */
env->reginfo = infop;
return (0);
err:
retry: /* Close any open file handle. */
if (env->lockfhp != NULL) {
(void)__os_closehandle(env, env->lockfhp);
env->lockfhp = NULL;
}
/*
* If we joined or created the region, detach from it. If we created
* it, destroy it. Note, there's a path in the above code where we're
* using a temporary REGION structure because we haven't yet allocated
* the real one. In that case the region address (addr) will be filled
* in, but the REGION pointer (rp) won't. Fix it.
*/
if (infop->addr != NULL) {
if (infop->rp == NULL)
infop->rp = &tregion;
/* Reset the addr value that we "corrected" above. */
infop->addr = infop->primary;
(void)__env_sys_detach(env,
infop, F_ISSET(infop, REGION_CREATE));
}
/* Free the allocated name and/or REGINFO structure. */
if (infop->name != NULL)
__os_free(env, infop->name);
__os_free(env, infop);
/* If we had a temporary error, wait awhile and try again. */
if (ret == 0) {
if (!retry_ok || ++retry_cnt > 3) {
__db_errx(env, "unable to join the environment");
ret = EAGAIN;
} else {
__os_yield(env, retry_cnt * 3, 0);
goto loop;
}
}
return (ret);
}
/*
* __env_turn_on --
* Turn on the created environment.
*
* PUBLIC: int __env_turn_on __P((ENV *));
*/
int
__env_turn_on(env)
ENV *env;
{
REGENV *renv;
REGINFO *infop;
infop = env->reginfo;
renv = infop->primary;
/* If we didn't create the region, there's no need for further work. */
if (!F_ISSET(infop, REGION_CREATE))
return (0);
/*
* Validate the file. All other threads of control are waiting
* on this value to be written -- "Let slip the hounds of war!"
*/
renv->magic = DB_REGION_MAGIC;
return (0);
}
/*
* __env_turn_off --
* Turn off the environment.
*
* PUBLIC: int __env_turn_off __P((ENV *, u_int32_t));
*/
int
__env_turn_off(env, flags)
ENV *env;
u_int32_t flags;
{
REGENV *renv;
REGINFO *infop;
int ret, t_ret;
ret = 0;
/*
* Connect to the environment: If we can't join the environment, we
* guess it's because it doesn't exist and we're done.
*
* If the environment exists, attach and lock the environment.
*/
if (__env_attach(env, NULL, 0, 1) != 0)
return (0);
infop = env->reginfo;
renv = infop->primary;
MUTEX_LOCK(env, renv->mtx_regenv);
/*
* If the environment is in use, we're done unless we're forcing the
* issue or the environment has panic'd. (If the environment panic'd,
* the thread holding the reference count may not have cleaned up, so
* we clean up. It's possible the application didn't plan on removing
* the environment in this particular call, but panic'd environments
* aren't useful to anyone.)
*
* Otherwise, panic the environment and overwrite the magic number so
* any thread of control attempting to connect (or racing with us) will
* back off and retry, or just die.
*/
if (renv->refcnt > 0 && !LF_ISSET(DB_FORCE) && !renv->panic)
ret = EBUSY;
else
renv->panic = 1;
/*
* Unlock the environment (nobody should need this lock because
* we've poisoned the pool) and detach from the environment.
*/
MUTEX_UNLOCK(env, renv->mtx_regenv);
if ((t_ret = __env_detach(env, 0)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
/*
* __env_panic_set --
* Set/clear unrecoverable error.
*
* PUBLIC: void __env_panic_set __P((ENV *, int));
*/
void
__env_panic_set(env, on)
ENV *env;
int on;
{
if (env != NULL && env->reginfo != NULL)
((REGENV *)env->reginfo->primary)->panic = on ? 1 : 0;
}
/*
* __env_ref_increment --
* Increment the environment's reference count.
*
* PUBLIC: int __env_ref_increment __P((ENV *));
*/
int
__env_ref_increment(env)
ENV *env;
{
REGENV *renv;
REGINFO *infop;
int ret;
infop = env->reginfo;
renv = infop->primary;
/* If we're creating the primary region, allocate a mutex. */
if (F_ISSET(infop, REGION_CREATE)) {
if ((ret = __mutex_alloc(
env, MTX_ENV_REGION, 0, &renv->mtx_regenv)) != 0)
return (ret);
renv->refcnt = 1;
} else {
/* Lock the environment, increment the reference, unlock. */
MUTEX_LOCK(env, renv->mtx_regenv);
++renv->refcnt;
MUTEX_UNLOCK(env, renv->mtx_regenv);
}
F_SET(env, ENV_REF_COUNTED);
return (0);
}
/*
* __env_ref_decrement --
* Decrement the environment's reference count.
*
* PUBLIC: int __env_ref_decrement __P((ENV *));
*/
int
__env_ref_decrement(env)
ENV *env;
{
REGENV *renv;
REGINFO *infop;
/* Be cautious -- we may not have an environment. */
if ((infop = env->reginfo) == NULL)
return (0);
renv = infop->primary;
/* Even if we have an environment, may not have reference counted it. */
if (F_ISSET(env, ENV_REF_COUNTED)) {
/* Lock the environment, decrement the reference, unlock. */
MUTEX_LOCK(env, renv->mtx_regenv);
if (renv->refcnt == 0)
__db_errx(env,
"environment reference count went negative");
else
--renv->refcnt;
MUTEX_UNLOCK(env, renv->mtx_regenv);
F_CLR(env, ENV_REF_COUNTED);
}
/* If a private environment, we're done with the mutex, destroy it. */
return (F_ISSET(env, ENV_PRIVATE) ?
__mutex_free(env, &renv->mtx_regenv) : 0);
}
/*
* __env_detach --
* Detach from the environment.
*
* PUBLIC: int __env_detach __P((ENV *, int));
*/
int
__env_detach(env, destroy)
ENV *env;
int destroy;
{
REGENV *renv;
REGINFO *infop;
REGION rp;
int ret, t_ret;
infop = env->reginfo;
renv = infop->primary;
ret = 0;
/* Close the locking file handle. */
if (env->lockfhp != NULL) {
if ((t_ret =
__os_closehandle(env, env->lockfhp)) != 0 && ret == 0)
ret = t_ret;
env->lockfhp = NULL;
}
/*
* If a private region, return the memory to the heap. Not needed for
* filesystem-backed or system shared memory regions, that memory isn't
* owned by any particular process.
*/
if (destroy) {
/*
* Free the REGION array.
*
* The actual underlying region structure is allocated from the
* primary shared region, and we're about to free it. Save a
* copy on our stack for the REGINFO to reference when it calls
* down into the OS layer to release the shared memory segment.
*/
rp = *infop->rp;
infop->rp = &rp;
if (renv->region_off != INVALID_ROFF)
__env_alloc_free(
infop, R_ADDR(infop, renv->region_off));
}
/*
* Set the ENV->reginfo field to NULL. BDB uses the ENV->reginfo
* field to decide if the underlying region can be accessed or needs
* cleanup. We're about to destroy what it references, so it needs to
* be cleared.
*/
env->reginfo = NULL;
/* Reset the addr value that we "corrected" above. */
infop->addr = infop->primary;
if ((t_ret = __env_sys_detach(env, infop, destroy)) != 0 && ret == 0)
ret = t_ret;
if (infop->name != NULL)
__os_free(env, infop->name);
/* Discard the ENV->reginfo field's memory. */
__os_free(env, infop);
return (ret);
}
/*
* __env_remove_env --
* Remove an environment.
*
* PUBLIC: int __env_remove_env __P((ENV *));
*/
int
__env_remove_env(env)
ENV *env;
{
DB_ENV *dbenv;
REGENV *renv;
REGINFO *infop, reginfo;
REGION *rp;
u_int32_t flags_orig, i;
dbenv = env->dbenv;
/*
* We do not want to hang on a mutex request, nor do we care about
* panics.
*/
flags_orig = F_ISSET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
F_SET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
/*
* This routine has to walk a nasty line between not looking into the
* environment (which may be corrupted after an app or system crash),
* and removing everything that needs removing.
*
* Connect to the environment: If we can't join the environment, we
* guess it's because it doesn't exist. Remove the underlying files,
* at least.
*/
if (__env_attach(env, NULL, 0, 0) != 0)
goto remfiles;
infop = env->reginfo;
renv = infop->primary;
/*
* Kill the environment, if it's not already dead.
*/
renv->panic = 1;
/*
* Walk the array of regions. Connect to each region and disconnect
* with the destroy flag set. This shouldn't cause any problems, even
* if the region is corrupted, because we never look inside the region
* (with the single exception of mutex regions on systems where we have
* to return resources to the underlying system).
*/
for (rp = R_ADDR(infop, renv->region_off),
i = 0; i < renv->region_cnt; ++i, ++rp) {
if (rp->id == INVALID_REGION_ID || rp->type == REGION_TYPE_ENV)
continue;
/*
* !!!
* The REGION_CREATE_OK flag is set for Windows/95 -- regions
* are zero'd out when the last reference to the region goes
* away, in which case the underlying OS region code requires
* callers be prepared to create the region in order to join it.
*/
memset(&reginfo, 0, sizeof(reginfo));
reginfo.id = rp->id;
reginfo.flags = REGION_CREATE_OK;
/*
* If we get here and can't attach and/or detach to the
* region, it's a mess. Ignore errors, there's nothing
* we can do about them.
*/
if (__env_region_attach(env, &reginfo, 0) != 0)
continue;
#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
/*
* If destroying the mutex region, return any system
* resources to the system.
*/
if (reginfo.type == REGION_TYPE_MUTEX)
__mutex_resource_return(env, &reginfo);
#endif
(void)__env_region_detach(env, &reginfo, 1);
}
/* Detach from the environment's primary region. */
(void)__env_detach(env, 1);
remfiles:
/*
* Walk the list of files in the directory, unlinking files in the
* Berkeley DB name space.
*/
__env_remove_file(env);
F_CLR(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
F_SET(dbenv, flags_orig);
return (0);
}
/*
* __env_remove_file --
* Discard any region files in the filesystem.
*/
static void
__env_remove_file(env)
ENV *env;
{
int cnt, fcnt, lastrm, ret;
const char *dir;
char saved_char, *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];
/* Get the full path of a file in the environment. */
(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
if ((ret = __db_appname(env, DB_APP_NONE, buf, 0, NULL, &path)) != 0)
return;
/* Get the parent directory for the environment. */
if ((p = __db_rpath(path)) == NULL) {
p = path;
saved_char = *p;
dir = PATH_DOT;
} else {
saved_char = *p;
*p = '\0';
dir = path;
}
/* Get the list of file names. */
if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0)
__db_err(env, ret, "%s", dir);
/* Restore the path, and free it. */
*p = saved_char;
__os_free(env, path);
if (ret != 0)
return;
/*
* Remove files from the region directory.
*/
for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
/* Skip anything outside our name space. */
if (strncmp(names[cnt],
DB_REGION_PREFIX, sizeof(DB_REGION_PREFIX) - 1))
continue;
/* Skip queue extent files. */
if (strncmp(names[cnt], "__dbq.", 6) == 0)
continue;
/* Skip registry files. */
if (strncmp(names[cnt], "__db.register", 13) == 0)
continue;
/* Skip replication files. */
if (strncmp(names[cnt], "__db.rep", 8) == 0)
continue;
/*
* Remove the primary environment region last, because it's
* the key to this whole mess.
*/
if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
lastrm = cnt;
continue;
}
/* Remove the file. */
if (__db_appname(env,
DB_APP_NONE, names[cnt], 0, NULL, &path) == 0) {
/*
* Overwrite region files. Temporary files would have
* been maintained in encrypted format, so there's no
* reason to overwrite them. This is not an exact
* check on the file being a region file, but it's
* not likely to be wrong, and the worst thing that can
* happen is we overwrite a file that didn't need to be
* overwritten.
*/
(void)__os_unlink(env, path, 1);
__os_free(env, path);
}
}
if (lastrm != -1)
if (__db_appname(env,
DB_APP_NONE, names[lastrm], 0, NULL, &path) == 0) {
(void)__os_unlink(env, path, 1);
__os_free(env, path);
}
__os_dirfree(env, names, fcnt);
}
/*
* __env_region_attach
* Join/create a region.
*
* PUBLIC: int __env_region_attach __P((ENV *, REGINFO *, size_t));
*/
int
__env_region_attach(env, infop, size)
ENV *env;
REGINFO *infop;
size_t size;
{
REGION *rp;
int ret;
char buf[sizeof(DB_REGION_FMT) + 20];
/*
* Find or create a REGION structure for this region. If we create
* it, the REGION_CREATE flag will be set in the infop structure.
*/
F_CLR(infop, REGION_CREATE);
if ((ret = __env_des_get(env, env->reginfo, infop, &rp)) != 0)
return (ret);
infop->env = env;
infop->rp = rp;
infop->type = rp->type;
infop->id = rp->id;
/*
* __env_des_get may have created the region and reset the create
* flag. If we're creating the region, set the desired size.
*/
if (F_ISSET(infop, REGION_CREATE))
rp->size = (roff_t)size;
/* Join/create the underlying region. */
(void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id);
if ((ret = __db_appname(env,
DB_APP_NONE, buf, 0, NULL, &infop->name)) != 0)
goto err;
if ((ret = __env_sys_attach(env, infop, rp)) != 0)
goto err;
/*
* Fault the pages into memory. Note, do this BEFORE we initialize
* anything because we're writing pages in created regions, not just
* reading them.
*/
(void)__env_faultmem(env,
infop->addr, rp->size, F_ISSET(infop, REGION_CREATE));
/*
* !!!
* The underlying layer may have just decided that we are going
* to create the region. There are various system issues that
* can result in a useless region that requires re-initialization.
*
* If we created the region, initialize it for allocation.
*/
if (F_ISSET(infop, REGION_CREATE))
__env_alloc_init(infop, rp->size);
return (0);
err: /* Discard the underlying region. */
if (infop->addr != NULL)
(void)__env_sys_detach(env,
infop, F_ISSET(infop, REGION_CREATE));
infop->rp = NULL;
infop->id = INVALID_REGION_ID;
/* Discard the REGION structure if we created it. */
if (F_ISSET(infop, REGION_CREATE)) {
__env_des_destroy(env, rp);
F_CLR(infop, REGION_CREATE);
}
return (ret);
}
/*
* __env_region_detach --
* Detach from a region.
*
* PUBLIC: int __env_region_detach __P((ENV *, REGINFO *, int));
*/
int
__env_region_detach(env, infop, destroy)
ENV *env;
REGINFO *infop;
int destroy;
{
REGION *rp;
int ret;
rp = infop->rp;
if (F_ISSET(env, ENV_PRIVATE))
destroy = 1;
/*
* When discarding the regions as we shut down a database environment,
* discard any allocated shared memory segments. This is the last time
* we use them, and db_region_destroy is the last region-specific call
* we make.
*/
if (F_ISSET(env, ENV_PRIVATE) && infop->primary != NULL)
__env_alloc_free(infop, infop->primary);
/* Detach from the underlying OS region. */
ret = __env_sys_detach(env, infop, destroy);
/* If we destroyed the region, discard the REGION structure. */
if (destroy)
__env_des_destroy(env, rp);
/* Destroy the structure. */
if (infop->name != NULL)
__os_free(env, infop->name);
return (ret);
}
/*
* __env_sys_attach --
* Prep and call the underlying OS attach function.
*/
static int
__env_sys_attach(env, infop, rp)
ENV *env;
REGINFO *infop;
REGION *rp;
{
int ret;
/*
* All regions are created on 8K boundaries out of sheer paranoia,
* so we don't make some underlying VM unhappy. Make sure we don't
* overflow or underflow.
*/
#define OS_VMPAGESIZE (8 * 1024)
#define OS_VMROUNDOFF(i) { \
if ((i) < \
(UINT32_MAX - OS_VMPAGESIZE) + 1 || (i) < OS_VMPAGESIZE) \
(i) += OS_VMPAGESIZE - 1; \
(i) -= (i) % OS_VMPAGESIZE; \
}
OS_VMROUNDOFF(rp->size);
#ifdef DB_REGIONSIZE_MAX
/* Some architectures have hard limits on the maximum region size. */
if (rp->size > DB_REGIONSIZE_MAX) {
__db_errx(env, "region size %lu is too large; maximum is %lu",
(u_long)rp->size, (u_long)DB_REGIONSIZE_MAX);
return (EINVAL);
}
#endif
/*
* If a region is private, malloc the memory.
*
* !!!
* If this fails because the region is too large to malloc, mmap(2)
* using the MAP_ANON or MAP_ANONYMOUS flags would be an alternative.
* I don't know of any architectures (yet!) where malloc is a problem.
*/
if (F_ISSET(env, ENV_PRIVATE)) {
#if defined(HAVE_MUTEX_HPPA_MSEM_INIT)
/*
* !!!
* There exist spinlocks that don't work in malloc memory, e.g.,
* the HP/UX msemaphore interface. If we don't have locks that
* will work in malloc memory, we better not be private or not
* be threaded.
*/
if (F_ISSET(env, ENV_THREAD)) {
__db_errx(env, "%s",
"architecture does not support locks inside process-local (malloc) memory");
__db_errx(env, "%s",
"application may not specify both DB_PRIVATE and DB_THREAD");
return (EINVAL);
}
#endif
if ((ret = __os_malloc(
env, sizeof(REGENV), &infop->addr)) != 0)
return (ret);
infop->max_alloc = rp->size;
} else
if ((ret = __os_attach(env, infop, rp)) != 0)
return (ret);
/*
* We may require alignment the underlying system or heap allocation
* library doesn't supply. Align the address if necessary, saving
* the original values for restoration when the region is discarded.
*/
infop->addr_orig = infop->addr;
infop->addr = ALIGNP_INC(infop->addr_orig, sizeof(size_t));
rp->size_orig = rp->size;
if (infop->addr != infop->addr_orig)
rp->size -= (roff_t)
((u_int8_t *)infop->addr - (u_int8_t *)infop->addr_orig);
return (0);
}
/*
* __env_sys_detach --
* Prep and call the underlying OS detach function.
*/
static int
__env_sys_detach(env, infop, destroy)
ENV *env;
REGINFO *infop;
int destroy;
{
REGION *rp;
rp = infop->rp;
/* Restore any address/size altered for alignment reasons. */
if (infop->addr != infop->addr_orig) {
infop->addr = infop->addr_orig;
rp->size = rp->size_orig;
}
/* If a region is private, free the memory. */
if (F_ISSET(env, ENV_PRIVATE)) {
__os_free(env, infop->addr);
return (0);
}
return (__os_detach(env, infop, destroy));
}
/*
* __env_des_get --
* Return a reference to the shared information for a REGION,
* optionally creating a new entry.
*/
static int
__env_des_get(env, env_infop, infop, rpp)
ENV *env;
REGINFO *env_infop, *infop;
REGION **rpp;
{
REGENV *renv;
REGION *rp, *empty_slot, *first_type;
u_int32_t i, maxid;
*rpp = NULL;
renv = env_infop->primary;
/*
* If the caller wants to join a region, walk through the existing
* regions looking for a matching ID (if ID specified) or matching
* type (if type specified). If we return based on a matching type
* return the "primary" region, that is, the first region that was
* created of this type.
*
* Track the first empty slot and maximum region ID for new region
* allocation.
*
* MaxID starts at REGION_ID_ENV, the ID of the primary environment.
*/
maxid = REGION_ID_ENV;
empty_slot = first_type = NULL;
for (rp = R_ADDR(env_infop, renv->region_off),
i = 0; i < renv->region_cnt; ++i, ++rp) {
if (rp->id == INVALID_REGION_ID) {
if (empty_slot == NULL)
empty_slot = rp;
continue;
}
if (infop->id != INVALID_REGION_ID) {
if (infop->id == rp->id)
break;
continue;
}
if (infop->type == rp->type &&
F_ISSET(infop, REGION_JOIN_OK) &&
(first_type == NULL || first_type->id > rp->id))
first_type = rp;
if (rp->id > maxid)
maxid = rp->id;
}
/* If we found a matching ID (or a matching type), return it. */
if (i >= renv->region_cnt)
rp = first_type;
if (rp != NULL) {
*rpp = rp;
return (0);
}
/*
* If we didn't find a region and we don't have permission to create
* the region, fail. The caller generates any error message.
*/
if (!F_ISSET(infop, REGION_CREATE_OK))
return (ENOENT);
/*
* If we didn't find a region and don't have room to create the region
* fail with an error message, there's a sizing problem.
*/
if (empty_slot == NULL) {
__db_errx(env, "no room remaining for additional REGIONs");
return (ENOENT);
}
/*
* Initialize a REGION structure for the caller. If id was set, use
* that value, otherwise we use the next available ID.
*/
memset(empty_slot, 0, sizeof(REGION));
empty_slot->segid = INVALID_REGION_SEGID;
/*
* Set the type and ID; if no region ID was specified,
* allocate one.
*/
empty_slot->type = infop->type;
empty_slot->id = infop->id == INVALID_REGION_ID ? maxid + 1 : infop->id;
F_SET(infop, REGION_CREATE);
*rpp = empty_slot;
return (0);
}
/*
* __env_des_destroy --
* Destroy a reference to a REGION.
*/
static void
__env_des_destroy(env, rp)
ENV *env;
REGION *rp;
{
COMPQUIET(env, NULL);
rp->id = INVALID_REGION_ID;
}
/*
* __env_faultmem --
* Fault the region into memory.
*/
static int
__env_faultmem(env, addr, size, created)
ENV *env;
void *addr;
size_t size;
int created;
{
int ret;
u_int8_t *p, *t;
/* Ignore heap regions. */
if (F_ISSET(env, ENV_PRIVATE))
return (0);
/*
* It's sometimes significantly faster to page-fault in all of the
* region's pages before we run the application, as we see nasty
* side-effects when we page-fault while holding various locks, i.e.,
* the lock takes a long time to acquire because of the underlying
* page fault, and the other threads convoy behind the lock holder.
*
* If we created the region, we write a non-zero value so that the
* system can't cheat. If we're just joining the region, we can
* only read the value and try to confuse the compiler sufficiently
* that it doesn't figure out that we're never really using it.
*
* Touch every page (assuming pages are 512B, the smallest VM page
* size used in any general purpose processor).
*/
ret = 0;
if (F_ISSET(env->dbenv, DB_ENV_REGION_INIT)) {
if (created)
for (p = addr,
t = (u_int8_t *)addr + size; p < t; p += 512)
p[0] = 0xdb;
else
for (p = addr,
t = (u_int8_t *)addr + size; p < t; p += 512)
ret |= p[0];
}
return (ret);
}