stats.c   [plain text]


/* -*- c-file-style: "java"; indent-tabs-mode: nil; tab-width: 4; fill-column: 78 -*-
 *
 * distcc -- A simple distributed compiler system
 *
 * Copyright 2005 Google Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301,
 * USA.
 */

/* Author: Thomas Kho */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <string.h>
#include <time.h>

#include <sys/socket.h>
#include <sys/select.h>
#include <sys/statvfs.h>

#include "exitcode.h"
#include "distcc.h"
#include "trace.h"
#include "dopt.h"
#include "stats.h"
#include "srvnet.h"
#include "util.h"
#include "netutil.h"
#include "fcntl.h"
#include "daemon.h"

int dcc_statspipe[2];

#define MAX_FILENAME_LEN 1024

/* in prefork.c */
void dcc_manage_kids(int listen_fd);

struct stats_s {
    int counters[STATS_ENUM_MAX];
    int longest_job_time;
    char longest_job_name[MAX_FILENAME_LEN];
    char longest_job_compiler[MAX_FILENAME_LEN];
    int io_rate; /* read/write sectors per second */

    int compile_timeseries[300]; /* 300 3-sec time intervals */
} dcc_stats;

struct statsdata {
    enum stats_e type;

    /* used only for STATS_COMPILE_OK */
    int time;
    char filename[MAX_FILENAME_LEN];
    char compiler[MAX_FILENAME_LEN];
};

const char *stats_text[20] = { "TCP_ACCEPT", "REJ_BAD_REQ", "REJ_OVERLOAD",
    "COMPILE_OK", "COMPILE_ERROR", "COMPILE_TIMEOUT", "CLI_DISCONN",
    "OTHER" };

/* Call this to initialize stats */
int dcc_stats_init() {
    if (arg_stats) {
        if (pipe(dcc_statspipe) == -1) {
            return -1;
        }
    }
    memset(&dcc_stats, 0, sizeof(dcc_stats));
    return 0;
}


/* In the preforking model call this to initialize stats for forked children */
void dcc_stats_init_kid() {
    if (arg_stats) {
        close(dcc_statspipe[0]);
    }
}


/**
 * Logs countable event of type e to stats server
 **/
void dcc_stats_event(enum stats_e e) {
    if (arg_stats) {
        struct statsdata sd;
        memset(&sd, 0, sizeof(sd));
        sd.type = e;
        write(dcc_statspipe[1], &sd, sizeof(sd));
    }
}


/**
 * Logs a completed job to stats server
 **/
void dcc_stats_compile_ok(char *compiler, char *filename, int time_usec) {
    if (arg_stats) {
        struct statsdata sd;
        memset(&sd, 0, sizeof(sd));

        sd.type = STATS_COMPILE_OK;
        /* also send compiler, filename & runtime */
        sd.time = time_usec;
        strncpy(sd.filename, filename, MAX_FILENAME_LEN);
        strncpy(sd.compiler, compiler, MAX_FILENAME_LEN);
        write(dcc_statspipe[1], &sd, sizeof(sd));
    }
}


/*
 * Updates a running total of compiles in the last 1, 5, 15 minutes
 *
 * Returns the oldest slot
 */
static int dcc_stats_update_running_total(int increment) {
    static int prev_slot;
    static time_t last = 0;
    static int total = 0;
    int i;
    int cur_slot;
    int *const cts = dcc_stats.compile_timeseries;
    time_t now = time(NULL);

    cur_slot = (now / 3) % 300;

    if (last + 900 < now) {
        /* reset all stats; last call was >15 min ago */
        for (i = 0; i < 300; i++)
            cts[i] = total;
        prev_slot = cur_slot;
    }

    if (prev_slot != cur_slot) {
        /* different timeslot, so set the interval [prev, cur) */
        for (i = (prev_slot)%300; i != cur_slot; i = (i+1)%300) {
            cts[i] = total;
        }
        prev_slot = cur_slot;
    }

    total += increment;
    last = now;

    return cur_slot;
}


static int *dcc_stats_get_compile_totals(void) {
    int cur_slot;
    static int ct[3]; /* 1, 5, 15 min compile totals */
    int *const cts = dcc_stats.compile_timeseries;

    cur_slot = dcc_stats_update_running_total(0);

    ct[0] = cts[(cur_slot + 299)%300] - cts[(cur_slot+280)%300];
    ct[1] = cts[(cur_slot + 299)%300] - cts[(cur_slot+200)%300];
    ct[2] = cts[(cur_slot + 299)%300] - cts[(cur_slot+1)%300];

    return ct;
}


/* Sets dcc_stats.io_rate at most 50 secs */
static void dcc_stats_minutely_update(void) {
    static int prev_io_tot = -1;
    static time_t last = 0;
    int n_reads, n_writes;
    time_t now = time(NULL);

    if (last + 50 < now) {
        dcc_get_disk_io_stats(&n_reads, &n_writes);

        if (prev_io_tot == -1)
            dcc_stats.io_rate = -1;
        else
            dcc_stats.io_rate = (n_reads + n_writes - prev_io_tot) / (now - last);

        prev_io_tot = n_reads + n_writes;
        last = now;
    }
}

static long dcc_get_tmpdirinfo(void) {
    const char *tmp_dir;
    struct statvfs buf;

    if (dcc_get_tmp_top(&tmp_dir) != 0)
        return -1;

    if (statvfs(tmp_dir, &buf) != 0)
        return -1;

    if (buf.f_bsize >= 1024)
        return buf.f_bavail * (buf.f_bsize / 1024) / 1024;
    else
        return (buf.f_bavail * buf.f_bsize) / (1024 * 1024);
}

/**
 * Accept a connection on the stats port and send the reply, regardless of data
 * that the client sends us.
 **/
static void dcc_service_stats_request(int http_fd) {
    int acc_fd;
    int *ct;
    int num_D;
    int max_RSS;
    char *max_RSS_name;
    size_t reply_len;
    char challenge[1024];
    char reply[2048];
    struct dcc_sockaddr_storage cli_addr;
    socklen_t cli_len = sizeof(cli_addr);
    double loadavg[3];
    int free_space_mb;

    const char replytemplate[] = "\
HTTP/1.0 200 OK\n\
Content-Type: text/plain\n\
Connection: close\n\n\
argv /distccd\n\
<distccstats>\n\
dcc_tcp_accept %d\n\
dcc_rej_bad_req %d\n\
dcc_rej_overload %d\n\
dcc_compile_ok %d\n\
dcc_compile_error %d\n\
dcc_compile_timeout %d\n\
dcc_cli_disconnect %d\n\
dcc_other %d\n\
dcc_longest_job %s\n\
dcc_longest_job_compiler %s\n\
dcc_longest_job_time_msecs %d\n\
dcc_max_kids %d\n\
dcc_current_load %d\n\
dcc_load1 %1.2lf\n\
dcc_load2 %1.2lf\n\
dcc_load3 %1.2lf\n\
dcc_num_compiles1 %d\n\
dcc_num_compiles2 %d\n\
dcc_num_compiles3 %d\n\
dcc_num_procstate_D %d\n\
dcc_max_RSS %d\n\
dcc_max_RSS_name %s\n\
dcc_io_rate %d\n\
dcc_free_space %d MB\n\
</distccstats>\n";

    dcc_stats_minutely_update(); /* force update to get fresh disk io data */
    ct = dcc_stats_get_compile_totals();
    dcc_getloadavg(loadavg);

    free_space_mb = dcc_get_tmpdirinfo();
    dcc_get_proc_stats(&num_D, &max_RSS, &max_RSS_name);

    if (dcc_stats.longest_job_name[0] == 0)
        strcpy(dcc_stats.longest_job_name, "none");
    if (dcc_stats.longest_job_compiler[0] == 0)
        strcpy(dcc_stats.longest_job_compiler, "none");

    acc_fd = accept(http_fd, (struct sockaddr *) &cli_addr, &cli_len);
    if (dcc_check_client((struct sockaddr *)&cli_addr,
                         (int) cli_len,
                         opt_allowed) == 0) {
        reply_len = snprintf(reply, 2048, replytemplate,
                               dcc_stats.counters[STATS_TCP_ACCEPT],
                               dcc_stats.counters[STATS_REJ_BAD_REQ],
                               dcc_stats.counters[STATS_REJ_OVERLOAD],
                               dcc_stats.counters[STATS_COMPILE_OK],
                               dcc_stats.counters[STATS_COMPILE_ERROR],
                               dcc_stats.counters[STATS_COMPILE_TIMEOUT],
                               dcc_stats.counters[STATS_CLI_DISCONN],
                               dcc_stats.counters[STATS_OTHER],
                               dcc_stats.longest_job_name,
                               dcc_stats.longest_job_compiler,
                               dcc_stats.longest_job_time,
                               dcc_max_kids,
                               dcc_getcurrentload(),
                               loadavg[0], loadavg[1], loadavg[2],
                               ct[0], ct[1], ct[2],
                               num_D, max_RSS, max_RSS_name,
                               dcc_stats.io_rate,
                               free_space_mb);
        dcc_set_nonblocking(acc_fd);
        read(acc_fd, challenge, 1024); /* empty the receive queue */
        write(acc_fd, reply, reply_len);
    }

    /* Don't think we need this to prevent RST anymore, since we read() now */
#if 0
    shutdown(acc_fd, SHUT_WR); /* prevent connection reset */
#endif
    dcc_close(acc_fd);
}


/**
 * Process a packet of stats data
 **/
static void dcc_stats_process(struct statsdata *sd) {
    if (sd->type > STATS_ENUM_MAX) {
        /* Got a bad message */
        return;
    }

    switch (sd->type) {

    case STATS_TCP_ACCEPT:
    case STATS_REJ_BAD_REQ:
    case STATS_REJ_OVERLOAD:
        break;
    case STATS_COMPILE_OK:
        /* Record file with longest runtime */
        if (dcc_stats.longest_job_time < sd->time) {
            dcc_stats.longest_job_time = sd->time;
            strncpy(dcc_stats.longest_job_name, sd->filename,
                    MAX_FILENAME_LEN);
            strncpy(dcc_stats.longest_job_compiler, sd->compiler,
                    MAX_FILENAME_LEN);
        }
    case STATS_COMPILE_ERROR:
    case STATS_COMPILE_TIMEOUT:
    case STATS_CLI_DISCONN:
        /* We want to update the running compile total for all jobs that
         * non-trivially tax the CPU */
        dcc_stats_update_running_total(1);
    default: ;
    }

    dcc_stats.counters[sd->type]++;
}


/**
 * Collect runtime statistics from kids and serve them via HTTP
 * Also, maintains the pool of kids.
 **/
int dcc_stats_server(int listen_fd)
{
    int http_fd, max_fd;
    int i, ret;
    fd_set fds, fds_master;
    struct statsdata sd;
    struct timeval timeout;

    /* clear stats data */
    for (i = 0; i < STATS_ENUM_MAX; i++)
        dcc_stats.counters[i] = 0;
    dcc_stats.longest_job_time = -1;
    dcc_stats.longest_job_name[0] = 0;
    dcc_stats.io_rate = -1;

    if ((ret = dcc_socket_listen(arg_stats_port, &http_fd,
                                    opt_listen_addr)) != 0) {
        return ret;
    }
    rs_log_info("HTTP server started on port %d\n", arg_stats_port);

    /* We don't want children to inherit this FD */
    fcntl(http_fd, F_SETFD, FD_CLOEXEC);

    max_fd = (http_fd > dcc_statspipe[0]) ? (http_fd + 1)
                                           : (dcc_statspipe[0] + 1);

    FD_ZERO(&fds_master);
    FD_SET(dcc_statspipe[0], &fds_master);
    FD_SET(http_fd, &fds_master);

    while (1) {
        dcc_stats_minutely_update();

        timeout.tv_sec = 60;
        timeout.tv_usec = 0;
        fds = fds_master;
        ret = select(max_fd, &fds, NULL, NULL, &timeout);
        if (ret != -1) {
            if (FD_ISSET(dcc_statspipe[0], &fds)) {
                /* Received stats report from a child */
                if (read(dcc_statspipe[0], &sd, sizeof(sd)) != -1) {
                    dcc_stats_process(&sd);
                }
            }

            if (FD_ISSET(http_fd, &fds)) {
                /* Received request on stats reporting port */
                dcc_service_stats_request(http_fd);
            }
        } else {
            if (errno == EINTR) {
                /* Interrupted -- SIGCHLD? */
            }
        }

        dcc_manage_kids(listen_fd);
    }
}