/*-
 * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD$
 *
 * A simple algorithm for Anticipatory disk Scheduler (AS).
 * This version does not track process state or behaviour, and is
 * just a proof of concept to show how non work-conserving policies
 * can be implemented within this framework.
 */ 

/*
 * Anticipatory scheduling without per-client state.
 * 
 * The goal of this implementation is to improve throughput compared
 * to the pure elevator algorithm, but also make sure that clients
 * do not starve.
 * 
 * To this end, we use anticipation to help reduce seeks, but
 * set a maximum service share (in time or data) to prevent starvation.
 * Also, we mark requests as "good" or "bad" depending on how well they
 * lend themselves to clustering. To improve throughput, we try to avoid
 * serving too many "bad" requests in a row.
 *  
 * The scheduler can be in one of three states
 *     READY	immediately serve the first pending request
 *     BUSY	one request is under service, wait for completion
 *     IDLING	do not serve incoming requests immediately, unless
 * 		they are "eligible" as defined later.
 * 
 * The event that triggers the state machine is mainly gs_next(),
 * the body of the dispatch loop, called after all events.
 * The calls to the scheduling algorithm (gs_start(), gs_done()
 * and gs_timeout()) do the following:
 *
 *     gs_start()	arrival of a new request.
 * 		Just enqueue; the dispatch loop will be called
 *		right after.
 * 
 *     gs_done()	completion of the request under service.
 * 		Set state to IDLING, start timer and
 * 		call the dispatch loop.
 * 
 *     gs_timeout()	timeout while idling.
 * 		Set state to READY, call the dispatch loop.
 * 
 * In turn, gs_next() does the following (in pseudocode,
 * see the implementation for more details):
 *
 * 	bio = <request at head of queue>
 * 	if (bio == NULL) {
 * 		state = READY; 
 * 	} else if (state == BUSY) {
 * 		bio = NULL; // do nothing
 * 	} else if (state == READY) {
 * 		state = BUSY; // dispatch request
 * 	} else { // state is IDLING, main anticipation logic goes here
 * 		if (!in_sequence(bio) && !expired()) {
 * 			bio = NULL; // do nothing
 * 		} else if (!expired()) {
 * 			state = BUSY; // dispatch request
 * 		} else {
 * 			mark(bio, GOOD);
 * 			rotate_queue();
 * 			bio = <request at head>// surely not null
 * 			state = BUSY; // dispatch request
 * 		}
 * 	}
 * 	if (bio != NULL)
 * 		<extract bio from queue>; stop timer;
 * 	return bio;
 * 
 * in_sequence() returns true if the request is right after the
 * 	current head position (so it will not cause a seek).
 *
 * expired() returns true if the current sequence is longer
 *	than the maximum allowed.
 *
 * rotate_queue() implements the selection of the next batch.
 *	In the simplest case we just jump to the next request
 *	in round-robin order; more complex policies are possible
 *	(and likely necessary) to prevent a seeky client from
 *	receiving an exceedingly high amount of service.
 */

#include <sys/cdefs.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/taskqueue.h>

#include "gs_scheduler.h"

/*
 * States of the AS scheduler state machine.
 */
enum g_as_status {
	/* Not waiting for anything: the next request can be served. */
	G_AS_READY = 0,
	/* A request is under service; waiting for it to complete. */
	G_AS_BUSY = 1,
	/* Anticipating: waiting for a new request to arrive. */
	G_AS_IDLING = 2
};

/*
 * Per-geom state of the AS scheduler, allocated in g_as_init() and
 * released in g_as_fini().
 */
struct g_as_softc {
	struct g_geom		*sc_geom;	/* geom we are attached to */
	enum g_as_status	sc_status;	/* current state machine state */
	/* sum of bio_length dispatched since the last reset in g_as_next() */
	long			sc_service;	/* received so far */
	/* value of 'ticks' at the last reset in g_as_next() */
	int			sc_start_tick;	/* starting tick */

	/* offset right past the end of the last dispatched request */
	off_t			sc_last_offset;

	/* configuration parameters */
	int			sc_wait_ticks;	/* idle timeout, in ticks */
	int			sc_budget;	/* max sc_service before expiring */
	int			sc_max_ticks;	/* max ticks before expiring */

	struct callout		sc_wait;	/* idle (anticipation) timer */
	struct bio_queue_head	sc_bioq;	/* pending requests */
};

static int
g_as_in_sequence(struct g_as_softc *sc, struct bio *bp)
{

	return (bp->bio_offset == sc->sc_last_offset);
}

/*
 * Return non-zero if the current batch has used up its share, either
 * because the accumulated service exceeds sc_budget or because more
 * than sc_max_ticks ticks have elapsed since sc_start_tick.
 */
static int
g_as_expired(struct g_as_softc *sc)
{
	int elapsed;

	/* Data budget exhausted. */
	if (sc->sc_service > sc->sc_budget)
		return (1);

	/* Time budget exhausted. */
	elapsed = ticks - sc->sc_start_tick;
	return (elapsed > sc->sc_max_ticks);
}

/*
 * Take the request at the head of the queue, put it back with
 * gs_bioq_disksort() and return the request that is at the head
 * afterwards.  The only caller in this file invokes it with a
 * non-empty queue.
 */
static struct bio *
g_as_rotate(struct bio_queue_head *head)
{
	struct bio *cur;

	cur = gs_bioq_takefirst(head);
	gs_bioq_disksort(head, cur);
	cur = gs_bioq_first(head);

	return (cur);
}

/*
 * The body of the dispatch loop.
 *
 * data is the scheduler softc.  When force is non-zero the service
 * accounting is reset, the status is set to G_AS_READY and the request
 * at the head of the queue (if any) is returned without applying the
 * anticipation logic.
 *
 * Returns the bio to be dispatched, already removed from the queue,
 * or NULL when no bio needs to be dispatched.  Whenever a bio is
 * selected in the non-forced paths, sc_status is set to G_AS_BUSY.
 */
static struct bio *
g_as_next(void *data, int force)
{
	struct g_as_softc *sc = data;
	struct bio *bio;
	int expired;

	bio = gs_bioq_first(&sc->sc_bioq);
	if (bio == NULL || force) {
		/* Empty queue or forced dispatch: reset parameters. */
		sc->sc_start_tick = ticks;
		sc->sc_service = 0;
		if (force)
			sc->sc_status = G_AS_READY;
	} else if (sc->sc_status == G_AS_BUSY) {
		/* We were called after _start: one request is in service. */
		bio = NULL;
	} else if (sc->sc_status == G_AS_READY) {
		/* Dispatch the request. */
		sc->sc_status = G_AS_BUSY;
	} else {	/* we are IDLING here */
		/*
		 * Sample the expiration once, so that both decisions
		 * below are taken on the same value of 'ticks'.
		 */
		expired = g_as_expired(sc);
		if (expired) {
			/*
			 * The current batch used up its share: move on to
			 * the next one and start a new accounting period.
			 * The selected request is dispatched, so we must
			 * become BUSY like in every other dispatch path;
			 * otherwise a later call could hand out a second
			 * request while this one is still in service.
			 */
			bio = g_as_rotate(&sc->sc_bioq);
			sc->sc_start_tick = ticks;
			sc->sc_service = 0;
			sc->sc_status = G_AS_BUSY;
		} else if (g_as_in_sequence(sc, bio)) {
			/* No seek needed: serve it right away. */
			sc->sc_status = G_AS_BUSY;
		} else {
			/* Keep anticipating, the timer is still pending. */
			bio = NULL;
		}
	}

	if (bio != NULL) {
		/* Remove the bio we decided to serve. */
		gs_bioq_remove(&sc->sc_bioq, bio);
		sc->sc_service += bio->bio_length;
		/* Remember where this request ends for g_as_in_sequence(). */
		sc->sc_last_offset = bio->bio_offset + bio->bio_length;
		callout_stop(&sc->sc_wait);
	}

	return (bio);
}

/*
 * Callout handler armed by g_as_done(): the anticipation interval
 * elapsed.  Runs with the scheduler lock of the geom held for the
 * whole body.
 */
static void
g_as_wait_timeout(void *data)
{
	struct g_as_softc *sc;
	struct g_geom *gp;

	sc = data;
	gp = sc->sc_geom;

	g_sched_lock(gp);
	if (sc->sc_status != G_AS_IDLING) {
		/* Not idling any more (should not happen): ignore. */
		g_sched_unlock(gp);
		return;
	}

	/*
	 * We timed out waiting for a new request for the current
	 * client: stop anticipating and dispatch whatever we have.
	 */
	sc->sc_status = G_AS_READY;
	g_sched_dispatch(gp);
	g_sched_unlock(gp);
}

/*
 * Arrival of a new schedulable disk I/O request.
 * The request is only inserted in the pending queue; deciding when
 * to serve it is left to the dispatch loop.  Always returns 0.
 */
static int
g_as_start(void *data, struct bio *bio)
{
	struct g_as_softc *sc;

	sc = data;
	gs_bioq_disksort(&sc->sc_bioq, bio);

	return (0);
}

/*
 * Completion callback from the geom.
 * Switch to idling, arm the anticipation timer for sc_wait_ticks
 * ticks and run the dispatch loop.  The completed bio is not used.
 */
static void
g_as_done(void *data, struct bio *bio)
{
	struct g_as_softc *sc;

	sc = data;

	/* Start anticipating a follow-up request. */
	sc->sc_status = G_AS_IDLING;
	callout_reset(&sc->sc_wait, sc->sc_wait_ticks, g_as_wait_timeout, sc);

	g_sched_dispatch(sc->sc_geom);
}

/*
 * Module glue: create the scheduler instance for a geom.
 * Allocates a zeroed descriptor and sets up the queue of pending
 * requests, the callout used for the idle timeout and the default
 * configuration parameters.  Returns the descriptor, which is later
 * released by the fini routine.
 */
static void *
g_as_init(struct g_geom *geom)
{
	struct g_as_softc *sc;

	sc = malloc(sizeof(*sc), M_GEOM_SCHED, M_WAITOK | M_ZERO);

	gs_bioq_init(&sc->sc_bioq);
	callout_init(&sc->sc_wait, CALLOUT_MPSAFE);

	sc->sc_geom = geom;
	sc->sc_status = G_AS_READY;
	sc->sc_budget = 0x00800000;	/* 8 MB */

	/* Idle timeout: hz/200 ticks, with a floor of 2 ticks. */
	if (hz >= 400)
		sc->sc_wait_ticks = hz / 200;
	else
		sc->sc_wait_ticks = 2;

	/* Maximum batch duration: hz/20 ticks, with a floor of 2 ticks. */
	if (hz >= 40)
		sc->sc_max_ticks = hz / 20;
	else
		sc->sc_max_ticks = 2;

	return (sc);
}

/*
 * Module glue: destroy a scheduler instance created by g_as_init().
 */
static void
g_as_fini(void *data)
{
	struct g_as_softc *sc;

	sc = data;

	/*
	 * The queue is expected to be empty here: geom should call
	 * _fini only when no bio's are active any more (GEOM does not
	 * know about our queue, but it can count the bio's associated
	 * to the geom).
	 */
	KASSERT(gs_bioq_first(&sc->sc_bioq) == NULL,
	    ("Requests still pending."));

	/* Wait for a possibly running timeout handler before freeing. */
	callout_drain(&sc->sc_wait);
	free(sc, M_GEOM_SCHED);
}

/*
 * Descriptor of the "as" scheduling algorithm: its name and the
 * callbacks implemented in this file.
 */
static struct g_gsched g_as = {
	.gs_name = "as",

	/* instance creation and destruction */
	.gs_init = g_as_init,
	.gs_fini = g_as_fini,

	/* request arrival, dispatch loop body, request completion */
	.gs_start = g_as_start,
	.gs_next = g_as_next,
	.gs_done = g_as_done,
};

DECLARE_GSCHED_MODULE(as, &g_as);
