#include <stdio.h>
#include <math.h>

/*
 * Fixed point implementation of Newton-Raphson (NR) square root approximation:
 *
 * The concept:
 * ------------
 *
 * Newton-Raphson approximation follows an iterative approach to finding 
 * numerical solutions to a specified equation. It has the property of quadratic
 * convergence which means the number of correct bits double every iteration.
 *
 * However, convergence doesn't just take place unconditionally. The initial
 * guess should be reasonably close to the final answer. Otherwise, it will 
 * diverge and take off for Pluto. Typically, the initial guess can be off by
 * slightly more than a factor of two. 
 *
 * We encourage the interested reader to read "Numerical Recipies in C" or surf
 * to the Wikipedia page for "Newton's method". These sources contain alot of
 * background information and also pictures (very important!).
 *
 * The basic NR work-horse is the following recursive equation:
 *
 *                  f(x[n])
 * x[n+1] = x[n] - --------                 (1)
 *                 f'(x[n])
 *
 * where f() denotes the function we want to solve and f'() denotes the first 
 * derivative of this function.
 *
 * Now we get to the square root part... We just have to find f(). The 
 * straightforward approach would be:
 *
 * x^2 - a = 0                              (2)
 *
 * where 'a' denotes our input. it doesn't look too complicated, right? however,
 * when applying it to our iteration (1) it results in a division we have to 
 * perform and basically can't get around:
 *
 * x[n+1] = x[n] - 0.5 (x[n] - a/x[n])      (3)
 *
 * As you can see the a/x[n] term contains the division,
 *
 * A division remains a costly operation on processors even today. If we can 
 * replace it with a multiple-with-inverse or even a right shift we will do it.
 * The following alternative to f() is the following: 
 *
 *  1     1
 * --- - --- = 0                            (4)
 * x^2    a
 *
 * Which, of course, also has the answer x = sqrt(a). If we substitute this in
 * the NR iteration (as seen in equation 1) we get the following result:
 *
 * x[n+1] = x[n](1.5 - 0.5*inv*x[n]*x[n])   (5)
 *
 * where inv = 1/a
 *
 * Ah, now we have a lot multiplies instead of one division. This is ideal on
 * processors with a fast multiply (DSPs and many CPU's such as 68060, Power PC
 * or Pentium).
 *
 * The first guess:
 * ----------------
 *
 * Now we still have the problem of the first guess to attend to. A reliable way
 * to get the first guess is to use the half of the exponent (power of two) in 
 * order 'shift' the input close to the answer.
 *
 * But computing such an exponent is not always trivial. If we got a floating 
 * point input and we know the exact location of the exponent bit field then we
 * are really close already to the answer.
 *
 * The IEEE floating point format looks like this where 's' denotes sign, 'f'
 * denotes the fraction and 'e' denotes the exponent we need. Take a look at the
 * ASCII art below:
 *
 * +-----------------+
 * |s|eee..|ffff.....|
 * +-----------------+
 *
 * It seems the extraction, halving and insertion of the exponent field is a 
 * trivial process with a floating point number.
 *
 * However, if we got integer input we still need to check the position of the
 * highest '1' in the word. This is less trivial and may be solved by a search
 * algorithm. A binary search may be used to limit the time complexity to 
 * O(log(n)). On small words even a linear search possibly followed by a table
 * look-up will be very effective.
 *
 * Implementation issues:
 * ----------------------
 *
 * The iteration we want to used, as listed in equation 5, features many 
 * multiplies. Clearly this does not benefit the processors with a slow
 * multiplier. In this case we can better stick equation 3.
 *
 * If we do have a fast multiplier then equation 5 is really the best. This
 * being said, a multiply-with-inverse operation also requires some caution.
 * Notably in the fixed point case..
 *
 * Often, a 16 bit fraction for multiplication is not enough, certainly not
 * when the input is large. Computing a reliable answer for an input of say
 * 10000 would be impossible. For this we need 24 bit precision at least.
 * Luckily, most DSPs feature this precision and most modern CPU's do as well.
 *
 * The implentations below are:
 * 'nr_sqrt()'   double precision floating point, no architecture specific 
 *               optimisations
 * 'nr_sqrt_i()' 32 bit integer, uses 64 bit internally for multiplication
 *               guard bits.. multiplication with inverse has two distinct
 *               cases: a<2^20 and a>=2^20. in can be implemented with the
 *               68020+ mulu.l (32x32->64) and roxr.l..
 *
 */


double nr_sqrt(double a) 
{
  double res;
  double inv=1.0/a;
  int i, exp;

  /* first guess, it's good we already got the exponent.. */
  frexp(a, &exp);
  printf("exp=%d\n", exp);
  res = (exp<0) ? (1<<(-exp>>1))*a : a/( (double) (1<<(exp>>1))) ;

  for (i=0; i<5; i++) {
    printf("a=%f, res=%f, inv=%f\n", a, res, inv);
    res = res * ( 1.5 - 0.5*inv*res*res );
  }

  return res;
}

unsigned long mask_table[5] =
{
  0x0000FFFF,0x000000FF,0x0000000F,0x00000003,0x00000001
};

double nr_sqrt_i2(unsigned long la) 
{
  int i, hi, lo;
  long long lres, tmp, linv;

  /*
   * first guess: compute the power-of-two exponent by means of a search for the highest '1'
   * this is implementation is based on a binary search.
   */
  for (i=0, lo=0, hi=32; i<5; i++) {
    int avg=(hi+lo)>>1;
    unsigned long tmpmsk = mask_table[i]<<avg;
    unsigned long masked = la & tmpmsk;
    if (masked)
      lo=avg;
    else
      hi=avg;
    printf("tmpmsk=%lX, masked=%lX, lo=%d, hi=%d\n", tmpmsk, masked, lo, hi);
  }
  lres = (hi<0) ? la<<(-hi>>1) : la>>(hi>>1);

  if (hi<20) {

    /* nice for integer precision output when input has range (1<la<1,000,000) */
    linv = 0x100000000LL/la;
    for (i=0; i<5; i++) {
      printf("res\t\t\t= 0x%08llX\n", lres);
      tmp=lres*lres;
      printf("res^2\t\t\t= 0x%08llX\n", tmp);
      tmp*=linv;

      /* shift and round, in asm this is way faster.. */
      //tmp>>=16;
      //if (tmp&1)
      //  tmp+=2;
      //tmp>>=1;
      tmp>>=17;

      printf("res^2/la\t\t= 0x%08llX\n", tmp);

      tmp=0x00018000-tmp;
      lres*=tmp;

      /* shift and round, in asm this is way faster.. */
      lres>>=15;
      if (lres&1)
      lres+=2;
      lres>>=1;
    }

  } else {

    /* nice for integer precision output when input has range (1,000,000<la<4,000,000,000) */
    linv = 0x1000000000000LL/la;
    for (i=0; i<5; i++) {
      printf("res\t\t\t= 0x%08llX\n", lres);
      tmp=lres*lres;
      printf("res^2\t\t\t= 0x%08llX\n", tmp);
      tmp*=linv;

      /* shift and round, in asm this is way faster.. */
      //tmp>>=16;
      //if (tmp&1)
      //  tmp+=2;
      //tmp>>=1;
      tmp>>=33;

      printf("res^2/la\t\t= 0x%08llX\n", tmp);

      tmp=0x00018000-tmp;
      lres*=tmp;

      /* shift and round, in asm this is way faster.. */
      lres>>=15;
      if (lres&1)
      lres+=2;
      lres>>=1;
    }


  }

  return lres;
}

int main(void) 
{
  double in=2000000000;
  long lin = (unsigned long) (in);
  unsigned long lout = nr_sqrt_i2(lin);
  double out = (double) lout;

  //printf("sqrt(%f)=%f\n", in, nr_sqrt(in));
  printf("sqrt(%f)=%f\n", in, out) ;
  
  return 0;
}
