/////////////////////////////////////////////////////////
// Class	    	:   BRL
// Author	    	:	F. Meyer
// Description	    :   - implementation of the SARSA algorithm
//						- reinforcement learning of a blackjack
//						  strategy against a fixed-policy dealer
// Last Modified    :	03.02.1998
/////////////////////////////////////////////////////////

import java.awt.*;

/////////////////////////////////////////////////////////
// BRL learns a blackjack playing strategy with the SARSA
// algorithm. Learning runs on its own low-priority thread
// (see run()); start(), suspend(), resume() and stop() are
// meant to be called from another thread.
//
// States are small integers:
//   1..20  : hard totals
//   22..31 : hands holding an ace counted as 11 (total = state - 11)
//   -1     : terminal state (stand, 21 reached, or bust)
/////////////////////////////////////////////////////////
public class BRL implements Runnable {

    // CONSTANTS

    // learning type
    public static final int SARSA = 1;
    // public static final int QLEARNING = 2;

    // action selection type
    public static final int E_GREEDY = 1;
    public static final int SOFTMAX = 2;

    // players
    public static final int DEALER = 0;
    public static final int COMPUTER = 1;

    // possible actions
    public static final int STAND = 1;
    public static final int HIT = 0;

    // Written by the controlling thread and read by the learner thread,
    // hence volatile.
    private volatile boolean threadSuspended = false;

    // VARIABLES
    int learningType;
    int actionSelect;

    BRLPanel parent;

    int numberOfStates;
    BStrategy Q;
    int[] cards;
    boolean[] playedCards;

    Frame graphFrame;
    BGraph2D graph;

    int goodReward;
    int badReward;
    int defaultReward;

    double alpha;
    double gamma;
    double epsilon;

    int episodes;
    int plays;
    // Written by start()/stop() and polled by the learner thread, hence volatile.
    volatile boolean running;
    int curEpisode, curPlay;
    int gamesWon, gamesLost;
    int gamesPlayed;
    int dealerLimit;

    int computerState, dealerState;
    int dealerScore, computerScore;
    int[] dealerCards, computerCards;
    int dealerCardNumber, computerCardNumber;
    int dealerReward, computerReward;

    Thread thisThread;

    /////////////////////////////////////////////////////////
    // Constructor
    //   parent   : panel that is notified after every episode
    //   episodes : number of episodes to learn
    //   plays    : number of games played per episode
    /////////////////////////////////////////////////////////

    public BRL(BRLPanel parent, int episodes, int plays) {
        this.parent = parent;

        learningType = SARSA;
        actionSelect = E_GREEDY;

        numberOfStates = 32;
        Q = new BStrategy(32);
        Q.clear();
        cards = new int[52];
        playedCards = new boolean[52];
        computerCards = new int[10];
        dealerCards = new int[10];
        dealerLimit = 17;

        alpha = 0.01;
        gamma = 0.9;
        epsilon = 0.01;

        goodReward = 0;
        badReward = -1;
        defaultReward = 0;

        this.episodes = episodes;
        this.plays = plays;
        running = false;

        graph = new BGraph2D(episodes);
        graph.setLineColor(Color.red);
        graph.setAxesColor(Color.blue);
        graph.setBackground(BRLPanel.lightbc);
        graph.setXLabel("Episode");
        graph.setYLabel("% win games");
        graph.setBounds(5, 5, 480, 300);

        thisThread = new Thread(this);

        // minimum priority lets the canvases repaint without problems
        // while learning is in progress
        thisThread.setPriority(Thread.MIN_PRIORITY);
    }

    //////////////////////////////////////////////////////////////////////////////////////////////////
    // Reinforcement Learning
    //////////////////////////////////////////////////////////////////////////////////////////////////


    //////////////////////////////////////////////////////
    // Method       :   start
    // Description  :   reset the counters, publish the table being
    //                  learned as the players' current strategy and
    //                  start the learner thread
    /////////////////////////////////////////////////////////

    public synchronized void start() {
        // initialisation of the variables, the graph
        parent.initBRLVariables();
        parent.setGraph(graph);
        curEpisode = 0;
        curPlay = 0;
        running = true;
        Q.setName("Current");
        BPlayers.player[0].setCurrentStrategy(Q);
        BPlayers.player[1].setCurrentStrategy(Q);
        // started last so that everything above is in place before run() begins
        thisThread.start();
    }


    //////////////////////////////////////////////////////
    // Method       :   suspend
    // Description  :   ask the learner thread to pause; it parks at
    //                  the start of its next episode. No effect when
    //                  learning is not running.
    /////////////////////////////////////////////////////////

    public void suspend() {
        if (running) {
            threadSuspended = true;
        }
    }


    //////////////////////////////////////////////////////
    // Method       :   resume
    // Description  :   let a suspended learner thread continue
    /////////////////////////////////////////////////////////

    public synchronized void resume() {
        if (running) {
            threadSuspended = false;
            notifyAll();
        }
    }

    //////////////////////////////////////////////////////
    // Method       :   stop
    // Description  :   stop learning and hand the learned table to
    //                  both players under the name "SARSA"
    /////////////////////////////////////////////////////////

    public synchronized void stop() {
        boolean wasRunning = running;
        running = false;

        // A suspended learner is parked in wait(); release it so that it can
        // observe running == false and terminate instead of hanging forever.
        threadSuspended = false;
        notifyAll();

        // run() calls stop() itself when it leaves its loop, so after a
        // user-initiated stop this method is entered a second time. Register
        // the strategy only on the running -> stopped transition so that it
        // is not added twice.
        if (!wasRunning) {
            return;
        }
        Q.setName("SARSA");
        BPlayers.player[0].addStrategy(Q);
        BPlayers.player[1].addStrategy(Q);
    }


    //////////////////////////////////////////////////////
    // Method       :   run
    // Description  :   learning is performed here: 'episodes'
    //                  episodes of 'plays' games each. After every
    //                  episode the win percentage is added to the
    //                  graph and the parent panel is refreshed.
    //////////////////////////////////////////////////////

    public void run() {

        // Transition that ended the computer's hand. Its update is delayed
        // until the dealer has played, because only then is the final
        // reward (game won or lost) known.
        int lastS = 0, lastA = 0, lastNextS = 0, lastNextA = 0, lastR = 0;

        initEstimateFunction();     // reset the Q-values

        for (int i = 0; i < episodes; i++) {

            waitWhileSuspended();
            if (!running) break;

            curEpisode = i;
            int win = 0;
            int lose = 0;

            for (int j = 0; j < plays; j++) {
                if (!running) break;

                curPlay = j;
                resetRound();
                boolean terminalUpdatePending = false;

                // Deal two cards to the player and to the dealer and
                // derive the initial states from them
                computerState = initState(COMPUTER);
                dealerState = initState(DEALER);
                computerCardNumber = 2;
                dealerCardNumber = 2;

                if (computerState == -1)
                    computerReward = goodReward;

                // let the learner play
                while (computerState != -1) {
                    int a = selectAction(computerState);

                    int newCard = 0;
                    if (a == HIT) {
                        newCard = dealCard();
                        // the hand arrays hold 10 cards; a longer (very rare)
                        // hand is still scored, just not recorded
                        if (computerCardNumber < computerCards.length)
                            computerCards[computerCardNumber] = newCard;
                        computerCardNumber++;
                    }

                    // calc state and reward
                    int nextS = nextState(computerState, a, newCard, COMPUTER);
                    computerReward = calcReward(nextS, computerScore);
                    int nextA = (nextS > 0) ? selectAction(nextS) : STAND;

                    if (nextS == -1) {
                        // end of the hand: remember the transition for later
                        lastS = computerState;
                        lastA = a;
                        lastNextS = nextS;
                        lastNextA = nextA;
                        lastR = computerReward;
                        terminalUpdatePending = true;
                    }
                    else {
                        updateEstimateFunction(computerState, a, computerReward, nextS, nextA);
                    }

                    computerState = nextS;
                }

                if (dealerState == -1) dealerReward = goodReward;

                // let the dealer play, unless the computer already went bust
                if (computerReward == goodReward || computerReward == defaultReward) {
                    while (dealerState != -1) {
                        int a = selectDealerAction();
                        if (a == HIT) {
                            int newCard = dealCard();
                            if (dealerCardNumber < dealerCards.length)
                                dealerCards[dealerCardNumber] = newCard;
                            dealerCardNumber++;
                            dealerState = nextState(dealerState, a, newCard, DEALER);
                            dealerReward = calcReward(dealerState, dealerScore);
                        }
                        else {
                            dealerState = -1;
                            dealerReward = calcReward(dealerState, dealerScore);
                        }
                    }
                }
                else {
                    dealerReward = goodReward;
                }

                // who wins ?
                // NOTE(review): '>' looks like it was meant to be '=='; with
                // the default rewards this test is never true. It does not
                // influence the win/lose tally below, so it is left as is.
                if (computerReward > goodReward && computerScore > dealerScore)
                    dealerReward = badReward;
                if (dealerReward == goodReward && dealerScore > computerScore)
                    computerReward = badReward;
                if ((computerReward == goodReward || computerReward == defaultReward) &&
                    (computerScore > dealerScore || dealerReward == badReward)) {
                    win++;
                    lastR = goodReward;
                }
                else if ((dealerReward == goodReward || dealerReward == defaultReward) &&
                         (dealerScore >= computerScore || computerReward == badReward)) {
                    lose++;
                    lastR = badReward;
                }
                else
                    System.out.println(computerScore + "  " + dealerScore + "  " + dealerReward);

                if (terminalUpdatePending)
                    updateEstimateFunction(lastS, lastA, lastR, lastNextS, lastNextA);

                pause(2);

            } // end for (j)
            gamesWon += win;
            gamesLost += lose;
            parent.updateInformations();

            graph.addPoint(curEpisode, (int)((float)win/(float)plays * 100.0));

            pause(2);

        } // end for (i)

        stop();
    }


    //////////////////////////////////////////////////////
    // Method       :   waitWhileSuspended
    // Description  :   block the learner thread while a suspend
    //                  request is pending. An interrupt is treated
    //                  as a request to stop learning.
    /////////////////////////////////////////////////////////

    private void waitWhileSuspended() {
        if (!threadSuspended) {
            return;
        }
        synchronized (this) {
            try {
                while (threadSuspended && running)
                    wait();
            }
            catch (InterruptedException e) {
                Thread.currentThread().interrupt();   // keep the interrupt visible
                stop();
            }
        }
    }


    //////////////////////////////////////////////////////
    // Method       :   pause
    // Description  :   sleep for 'millis' milliseconds so that other
    //                  threads get a chance to run. An interrupt is
    //                  treated as a request to stop learning.
    /////////////////////////////////////////////////////////

    private void pause(long millis) {
        try {
            Thread.sleep(millis);
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();       // keep the interrupt visible
            stop();
        }
    }


    //////////////////////////////////////////////////////
    // Method       :   resetRound
    // Description  :   put the deck, both hands, the scores and the
    //                  rewards back to their start-of-game values
    /////////////////////////////////////////////////////////

    private void resetRound() {
        for (int k = 0; k < 52; k++)    // no card has been played
            playedCards[k] = false;

        for (int k = 0; k < 10; k++) {  // nobody holds a card
            dealerCards[k] = 0;
            computerCards[k] = 0;
        }

        computerScore = 0;
        dealerScore = 0;
        computerReward = 0;
        dealerReward = 0;
    }


    //////////////////////////////////////////////////////
    // Method       :   initState
    // Description  :   - deal the first two cards to 'who'
    //                    (COMPUTER, anything else means the dealer)
    //                  - update that player's score and reward
    //                  - return the resulting initial state
    /////////////////////////////////////////////////////////

    private int initState(int who) {
        int s = 0;

        for (int i = 0; i < 2; i++) {
            int newCard = dealCard();
            if (who == COMPUTER) {
                computerCards[i] = newCard;
                s = nextState(s, HIT, newCard, who);
                computerReward = calcReward(s, computerScore);
            }
            else {
                dealerCards[i] = newCard;
                s = nextState(s, HIT, newCard, who);
                dealerReward = calcReward(s, dealerScore);
            }
        }

        return s;
    }


    //////////////////////////////////////////////////////
    // Method       :   calcReward
    // Description  :   compute the reward: badReward for a terminal
    //                  state with a score above 21, goodReward for
    //                  any other terminal state, defaultReward for a
    //                  non-terminal state
    /////////////////////////////////////////////////////////

    private int calcReward(int state, int score) {
        if (state != -1) {
            return defaultReward;
        }
        return (score > 21) ? badReward : goodReward;
    }


    //////////////////////////////////////////////////////
    // Method       :   selectAction
    // Description  :   select an action (HIT or STAND) in state s
    //                  following the configured policy
    //                      * epsilon-greedy or softmax
    /////////////////////////////////////////////////////////

    private int selectAction(int s) {
        int a;

        switch (actionSelect) {
        case SOFTMAX : {
            double[] pi = new double[2];
            double sumExp = 0.0;

            // Probability of taking action a in state s. The largest
            // Q-value is subtracted before exponentiating: the resulting
            // probabilities are the same, but exp() cannot overflow.
            double maxQ = Math.max(Q.get(s, HIT), Q.get(s, STAND));
            for (a = 0; a < 2; a++) {
                pi[a] = Math.exp(Q.get(s, a) - maxQ);
                sumExp += pi[a];
            }
            for (a = 0; a < 2; a++) {
                pi[a] = pi[a] / sumExp;
            }

            // Draw from the cumulative distribution using half-open
            // intervals [low, high). The last action is the fallback so
            // that rounding in the sum can never favour the first one.
            double rnd = Math.random();
            double cumulative = 0.0;
            int chosen = 1;
            for (a = 0; a < 2; a++) {
                cumulative += pi[a];
                if (rnd < cumulative) {
                    chosen = a;
                    break;
                }
            }
            a = chosen;

            break;
        }
        case E_GREEDY:
        default: {
            double qHit = Q.get(s, HIT);
            double qStand = Q.get(s, STAND);

            int best;
            if (qHit > qStand)
                best = HIT;
            else if (qHit < qStand)
                best = STAND;
            else
                best = (int)(Math.random()*2);   // tie: pick at random

            // explore with probability epsilon
            if (Math.random() < epsilon)
                best = (int)(Math.random()*2);
            a = best;
            break;
        }
        }

        return a;
    }


    //////////////////////////////////////////////////////
    // Method       :   selectDealerAction
    // Description  :   fixed policy for the dealer: HIT while the
    //                  dealer's score is below dealerLimit
    /////////////////////////////////////////////////////////

    private int selectDealerAction() {
        return (dealerScore < dealerLimit) ? HIT : STAND;
    }



    //////////////////////////////////////////////////////
    // Method       :   nextState
    // Description  :   compute the state reached from state s when
    //                  'who' takes action a and, for HIT, receives
    //                  newCard (11 denotes an ace). Also updates the
    //                  score of 'who'. Returns -1 when the hand is
    //                  over.
    /////////////////////////////////////////////////////////

    private int nextState(int s, int a, int newCard, int who) {

        switch (a) {
        case HIT : {
            if (newCard == 11) {
                if (s < 22) {
                    s += 22;        // first ace: move to the soft states
                }
                else {
                    newCard = 1;    // a further ace only counts as 1
                    s++;
                }
            }
            else if (s < 21) {
                s += newCard;
                if (s > 21) s = -1; // hard total went bust
            }
            else if (s > 21) {
                s += newCard;
            }
            break;
        }
        case STAND : {
            s = -1;
            break;
        }
        }

        int score = ((who == COMPUTER) ? computerScore : dealerScore) + newCard;
        if (score > 21 && s > 21) {
            // soft hand over 21: the ace now counts as 1 and the state
            // becomes the corresponding hard total
            score -= 10;
            s -= 21;
        }
        if (score > 20)
            s = -1;                 // 21 or bust ends the hand

        if (who == COMPUTER)
            computerScore = score;
        else
            dealerScore = score;

        return s;
    }


    //////////////////////////////////////////////////////
    // Method       :   initEstimateFunction
    // Description  :   clear the table of Q-values
    /////////////////////////////////////////////////////////

    private void initEstimateFunction() {
        Q.clear();
    }


    //////////////////////////////////////////////////////
    // Method       :   updateEstimateFunction
    // Description  :   SARSA update of Q(s, a):
    //                  Q(s,a) += alpha * (r + gamma*Q(s',a') - Q(s,a))
    //                  where Q(s',a') is taken as 0 when s' is the
    //                  terminal state -1
    /////////////////////////////////////////////////////////

    private void updateEstimateFunction(int s, int a, int r, int next_s, int next_a) {
        double qNext = (next_s == -1) ? 0.0 : gamma * Q.get(next_s, next_a);
        double current = Q.get(s, a);

        Q.set(s, a, current + alpha * (r + qNext - current));
    }


    //////////////////////////////////////////////////////
    // Method       :   calcState
    // Description  :   compute the state of a hand directly.
    //                  theHand holds indices into Cards.points; the
    //                  hand ends at the first 0 entry, after five
    //                  cards, or at the end of the array. Returns 0
    //                  when a hard total exceeds 21.
    /////////////////////////////////////////////////////////

    public static int calcState(int theHand[]) {

        int s = 0;
        int score = 0;
        int limit = Math.min(5, theHand.length);

        // the bound is tested before the element is read
        for (int i = 0; i < limit && theHand[i] != 0; i++) {
            int points = Cards.points[theHand[i]];

            if (points == 11) {
                if (s < 22) {
                    s += 22;
                }
                else {
                    // a further ace counts as 1, exactly as in nextState();
                    // otherwise e.g. ace + ace would be reported as hard 2
                    s++;
                    points = 1;
                }
            }
            else if (s < 21) {
                s += points;
                if (s > 21) {
                    s = 0;
                    break;
                }
            }
            else if (s > 21) {
                s += points;
            }

            score += points;
            if (score > 21 && s > 21) {
                score -= 10;
                s -= 21;
            }
        }

        // debug trace; skipped for arrays too short to print three entries
        if (theHand.length >= 3)
            System.out.println(s+" "+theHand[0]+" "+theHand[1]+" "+theHand[2]);
        return s;
    }


    //////////////////////////////////////////////////////
    // Method       :   dealCard
    // Description  :   draw a card that has not been played yet in
    //                  the current game, mark it as played and
    //                  return its point value
    /////////////////////////////////////////////////////////

    private int dealCard() {
        int whichCard;
        do {
            whichCard = (int) (Math.random() * 52);
        } while (playedCards[whichCard]);
        playedCards[whichCard] = true;
        return Cards.points[whichCard];
    }


    //////////////////////////////////////////////////////
    // get and set methods for class variables
    // (setters silently ignore out-of-range values)
    /////////////////////////////////////////////////////////

    // strategy
    public BStrategy getStrategy() {
        return this.Q;
    }

    // alpha value, accepted when 0 <= a < 1
    public void setAlpha(double a) {
        if (a >= 0 && a < 1)
            alpha = a;
    }

    public double getAlpha() {
        return alpha;
    }

    // gamma value, accepted when 0 < g < 1
    public void setGamma(double g) {
        if (g > 0 && g < 1)
            gamma = g;
    }

    public double getGamma() {
        return gamma;
    }

    // epsilon value, accepted when 0 < e < 1
    public void setEpsilon(double e) {
        if (e > 0 && e < 1)
            epsilon = e;
    }

    public double getEpsilon() {
        return epsilon;
    }

    // dealer limit, accepted when 0 <= d <= 21
    public void setDealerLimit(int d) {
        if (d >= 0 && d <= 21)
            dealerLimit = d;
    }

    public int getDealerLimit() {
        return dealerLimit;
    }

    // good, bad, default reward...
    public void setGoodReward(int g) {
        goodReward = g;
    }
    public void setBadReward(int b) {
        badReward = b;
    }
    public void setDefaultReward(int d) {
        defaultReward = d;
    }
    public int getGoodReward() {
        return goodReward;
    }
    public int getBadReward() {
        return badReward;
    }
    public int getDefaultReward() {
        return defaultReward;
    }

    // number of episodes and plays
    public void setEpisodes(int e) {
        episodes = e;
    }
    public int getEpisodes() {
        return episodes;
    }
    public void setPlays(int p) {
        plays = p;
    }
    public int getPlays () {
        return plays;
    }

    // General informations
    public int getWonGames() {
        return gamesWon;
    }
    public int  getLostGames() {
        return gamesLost;
    }
    public int getEpisode() {
        return curEpisode;
    }

    // Type of action selection; anything other than SOFTMAX selects E_GREEDY
    public void setActionSelection(int as) {
        actionSelect = (as == SOFTMAX) ? SOFTMAX : E_GREEDY;
    }

    // is the thread running??
    public boolean isRunning() {
        return running;
    }

    public BGraph2D getGraph() {
        return graph;
    }
}
