/*         ______   ___    ___ 
 *        /\  _  \ /\_ \  /\_ \ 
 *        \ \ \L\ \\//\ \ \//\ \      __     __   _ __   ___ 
 *         \ \  __ \ \ \ \  \ \ \   /'__`\ /'_ `\/\`'__\/ __`\
 *          \ \ \/\ \ \_\ \_ \_\ \_/\  __//\ \L\ \ \ \//\ \L\ \
 *           \ \_\ \_\/\____\/\____\ \____\ \____ \ \_\\ \____/
 *            \/_/\/_/\/____/\/____/\/____/\/___L\ \/_/ \/___/
 *                                           /\____/
 *                                           \_/__/
 *
 *      Math routines, compiled sprite wrapper, etc.
 *
 *      By Shawn Hargreaves.
 *
 *      fsqrt() and fhypot() routines by David Kuhling.
 *
 *      See readme.txt for copyright information.
 */


#include "asmdefs.inc"

.text



/* empty bank switch routine for the standard VGA mode and memory bitmaps.
 *  On entry %edx holds the bitmap pointer and %eax the line number; on
 *  return %eax holds that line's entry from the bitmap's BMP_LINE table.
 *  No bank is actually switched and no other register is modified.
 */
FUNC(_stub_bank_switch)
   movl BMP_LINE(%edx, %eax, 4), %eax
   ret

/* empty unbank routine: returns straight away without touching anything */
FUNC(_stub_unbank_switch)
   ret

/* NOTE(review): presumably only a marker for the end of the stub code
 * above (it is a bare ret) -- confirm how it is used by the callers.
 */
FUNC(_stub_bank_switch_end)
   ret




/* void apply_matrix_f(MATRIX_f *m, float x, float y, float z, 
 *                                  float *xout, float *yout, float *zout);
 *  Floating point vector by matrix multiplication routine.
 *
 *  Using the M_Vij and M_Ti fields of *m, this computes:
 *
 *     *xout = ((V00*x + V01*y) + V02*z) + T0
 *     *yout = ((V10*x + V11*y) + V12*z) + T1
 *     *zout = ((V20*x + V21*y) + V22*z) + T2
 *
 *  The three rows are interleaved rather than computed one after another
 *  (presumably to hide FPU latencies), so the instruction order below is
 *  deliberate. Results are stored in the order yout, xout, zout.
 *
 *  %ebx is saved and restored; %eax, %ecx and %edx are overwritten. The
 *  FPU stack is balanced on exit (nine loads, six popping adds and three
 *  popping stores) and grows at most five entries deep.
 *
 *  In the stack comments below st(0) is listed first. a0..a2, b0..b2 and
 *  c0..c2 are the x, y and z products of rows 0, 1 and 2; A, B and C are
 *  the finished row sums including the translation term.
 */
FUNC(apply_matrix_f)

   #define MTX    ARG1
   #define X      ARG2
   #define Y      ARG3
   #define Z      ARG4
   #define XOUT   ARG5
   #define YOUT   ARG6
   #define ZOUT   ARG7

   pushl %ebp
   movl %esp, %ebp
   pushl %ebx

   movl MTX, %edx                /* edx = matrix pointer */
   movl XOUT, %eax               /* eax = where to store the x result */
   movl YOUT, %ebx               /* ebx = where to store the y result */
   movl ZOUT, %ecx               /* ecx = where to store the z result */

   flds  M_V00(%edx)             /* V00 */
   fmuls X                       /* a0 */
   flds  M_V01(%edx)             /* V01 a0 */
   fmuls Y                       /* a1 a0 */
   flds  M_V02(%edx)             /* V02 a1 a0 */
   fmuls Z                       /* a2 a1 a0 */
   fxch  %st(2)                  /* a0 a1 a2 */

   faddp %st(0), %st(1)          /* a0+a1 a2 */
   flds  M_V10(%edx)             /* V10 a0+a1 a2 */
   fxch  %st(2)                  /* a2 a0+a1 V10 */

   faddp %st(0), %st(1)          /* a0+a1+a2 V10 */
   fxch  %st(1)                  /* V10 a0+a1+a2 */

   fmuls X                       /* b0 a0+a1+a2 */
   fxch  %st(1)                  /* a0+a1+a2 b0 */

   fadds M_T0(%edx)              /* A b0 */
   flds  M_V11(%edx)             /* V11 A b0 */

   fmuls Y                       /* b1 A b0 */
   flds  M_V12(%edx)             /* V12 b1 A b0 */

   fmuls Z                       /* b2 b1 A b0 */
   fxch  %st(1)                  /* b1 b2 A b0 */

   faddp %st(0), %st(3)          /* b2 A b0+b1 */
   flds  M_V20(%edx)             /* V20 b2 A b0+b1 */
   fxch  %st(3)                  /* b0+b1 b2 A V20 */

   faddp %st(0), %st(1)          /* b0+b1+b2 A V20 */
   fxch  %st(2)                  /* V20 A b0+b1+b2 */

   fmuls X                       /* c0 A b0+b1+b2 */
   fxch  %st(2)                  /* b0+b1+b2 A c0 */

   fadds M_T1(%edx)              /* B A c0 */
   flds  M_V21(%edx)             /* V21 B A c0 */

   fmuls Y                       /* c1 B A c0 */
   flds  M_V22(%edx)             /* V22 c1 B A c0 */

   fmuls Z                       /* c2 c1 B A c0 */
   fxch  %st(4)                  /* c0 c1 B A c2 */

   faddp %st(0), %st(1)          /* c0+c1 B A c2 */
   fxch  %st(1)                  /* B c0+c1 A c2 */
   fstps (%ebx)                  /* *yout = B; leaves c0+c1 A c2 */

   faddp %st(0), %st(2)          /* A c0+c1+c2 */
   fstps (%eax)                  /* *xout = A; leaves c0+c1+c2 */

   fadds M_T2(%edx)              /* C */
   fstps (%ecx)                  /* *zout = C; FPU stack empty again */

   popl %ebx
   movl %ebp, %esp
   popl %ebp
   ret                              /* end of apply_matrix_f() */




#undef X
#undef Y




/* void draw_compiled_sprite(BITMAP *bmp, COMPILED_SPRITE *sprite, int x, y)
 *  Draws a compiled sprite onto the specified bitmap at the specified
 *  position, _ignoring_ clipping. The bitmap must be in the same format
 *  that the sprite was compiled for.
 *
 *  The sprite supplies its own drawing code, reached through CMP_DRAW.
 *  This wrapper only loads the registers that code is called with:
 *
 *  Planar sprites (CMP_PLANAR non-zero): the drawer is called four times,
 *  once per plane, using the CMP_DRAW pointer found PLANE*8 bytes into
 *  the sprite. At each call:
 *     eax = edi = destination address (line start + x/4)
 *     ebx = value written to port 0x3C4 to select the plane
 *     ecx = distance between the bitmap's first two line pointers
 *     edx = 0x3C4
 *
 *  Linear sprites: the sprite's single drawer is called once with:
 *     eax = ecx = x scaled to a byte offset for the colour depth
 *     edi = y
 *     esi = the bitmap's BMP_WBANK function
 *     edx = bitmap pointer
 *
 *  NOTE(review): the planar loop relies on the drawer leaving ebx, edx and
 *  edi intact between planes -- confirm against the sprite compiler.
 *
 *  ebx, esi and edi are saved and restored for the caller.
 */
FUNC(draw_compiled_sprite)

   #define BMP       ARG1
   #define SPRITE    ARG2
   #define X         ARG3
   #define Y         ARG4

   pushl %ebp
   movl %esp, %ebp
   subl $4, %esp                 /* 1 local variable: */

   #define PLANE     -4(%ebp)

   pushl %ebx
   pushl %esi
   pushl %edi

   movl BMP, %edx                /* bitmap pointer in edx */
 #ifdef USE_FS
   movw BMP_SEG(%edx), %fs       /* load segment selector into fs */
 #endif

   movl SPRITE, %ebx
   cmpw $0, CMP_PLANAR(%ebx)     /* is the sprite planar or linear? */
   je linear_compiled_sprite

   movl X, %ecx                  /* get write plane mask in bx */
   andb $3, %cl                  /* cl = x & 3, the starting plane */
   movl $0x1102, %ebx            /* bl = 2, bh = 0x11 */
   shlb %cl, %bh                 /* bh = 0x11 << plane (0x11/22/44/88) */

   movl BMP_LINE+4(%edx), %ecx   /* get line width in ecx */
   subl BMP_LINE(%edx), %ecx     /* = line[1] - line[0] */

   movl X, %esi                  /* get destination address in edi */
   shrl $2, %esi                 /* x/4: four pixels share one address */
   movl Y, %edi
   movl BMP_LINE(%edx, %edi, 4), %edi
   addl %esi, %edi

   movl $0x3C4, %edx             /* port address in dx */

   movl $0, PLANE                /* zero the plane counter */

   _align_
planar_compiled_sprite_loop:
   movl %ebx, %eax               /* set the write plane */
   outw %ax, %dx                 /* al (2) -> port 0x3C4, ah -> 0x3C5 */

   movl %edi, %eax               /* get address in eax */

   movl PLANE, %esi              /* get the drawer function in esi */
   shll $3, %esi                 /* per-plane entries are 8 bytes apart */
   addl SPRITE, %esi
   movl CMP_DRAW(%esi), %esi

   call *%esi                    /* and draw the plane! */

   incl PLANE                    /* next plane */
   cmpl $4, PLANE
   jge draw_compiled_sprite_done

   rolb $1, %bh                  /* advance the plane position */
   adcl $0, %edi                 /* carry is set when the mask wraps from
                                  * 0x88 round to 0x11, which moves on to
                                  * the next destination byte */
   jmp planar_compiled_sprite_loop

   _align_
linear_compiled_sprite:
   movl X, %eax
   movzwl CMP_COLOR_DEPTH(%ebx), %ecx
   cmpl $24, %ecx
   jne normal_linear_compiled_sprite
   leal (%eax, %eax, 2), %eax    /* 24 bpp: x * 3 bytes per pixel */
   jmp end24bpp_linear_compiled_sprite

   _align_
normal_linear_compiled_sprite:
   addl $7, %ecx                 /* (depth+7)>>4 gives the shift count: */
   shrl $4, %ecx                 /* 8 -> 0, 15 and 16 -> 1, 32 -> 2 */
   shll %cl, %eax                /* x * bytes per pixel */

end24bpp_linear_compiled_sprite:
   movl %eax, %ecx               /* x coordinate in ecx */
   movl Y, %edi                  /* y coordinate in edi */
   movl BMP_WBANK(%edx), %esi    /* bank switch function in esi */
   movl CMP_DRAW(%ebx), %ebx     /* drawer function in ebx */

   call *%ebx                    /* and draw it! */

draw_compiled_sprite_done:
   movl BMP, %edx                /* reload: edx was reused in planar mode */
   UNWRITE_BANK()

   popl %edi
   popl %esi
   popl %ebx
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of draw_compiled_sprite() */




/* void _do_stretch(BITMAP *source, BITMAP *dest, void *drawer, 
 *                  int sx, fixed sy, fixed syd, int dx, int dy, int dh, 
 *                  int color_depth);
 *
 *  Helper function for stretch_blit(), calls the compiled line drawer.
 *
 *  For each of dh destination lines, the drawer is called with:
 *     esi = address of the source line (sy >> 16), plus sx in bytes
 *     edi = address of the destination line dy, plus dx in bytes
 *     es  = the destination bitmap's BMP_SEG selector
 *  after which sy is advanced by syd and dy by one.
 *
 *  sx and dx arrive in pixels and are converted to byte offsets here,
 *  according to color_depth (x1 for 8, x2 for 15/16, x3 for 24, x4 for
 *  32). Planar (BMP_ID_PLANAR) destinations use them unscaled and read
 *  the line tables directly instead of calling the bank switchers. Any
 *  other color_depth draws nothing.
 *
 *  Notes:
 *   - SX, DX, SY, DY and DH are updated in place in the caller's argument
 *     area on the stack.
 *   - Each loop tests DH only after drawing, so one line is drawn even
 *     when dh <= 0 on entry.
 *   - es, edi, esi and ebx are saved and restored; "pushw %es" leaves
 *     the stack 2 bytes off dword alignment for the rest of the routine.
 */
FUNC(_do_stretch)

   #define SOURCE       ARG1
   #define DEST         ARG2
   #define DRAWER       ARG3
   #define SX           ARG4
   #define SY           ARG5
   #define SYD          ARG6
   #define DX           ARG7
   #define DY           ARG8
   #define DH           ARG9
   #define COL_DEPTH    ARG10

   pushl %ebp
   movl %esp, %ebp
   pushw %es
   pushl %edi
   pushl %esi
   pushl %ebx

   movl DEST, %edx
   movw BMP_SEG(%edx), %es       /* load destination segment */
   movl DRAWER, %ebx             /* the actual line drawer */

   movl BMP_ID(%edx), %eax       /* planar destination gets its own loop */
   testl $BMP_ID_PLANAR, %eax
   jnz stretch_modex_loop
   movl COL_DEPTH, %eax          /* otherwise dispatch on colour depth */
   cmpl $8, %eax
   je stretch_normal_loop
   cmpl $15, %eax
   je stretch_bpp_16
   cmpl $16, %eax
   je stretch_bpp_16
   cmpl $24, %eax
   je stretch_bpp_24
   cmpl $32, %eax
   je stretch_bpp_32
   jmp stretch_done              /* unknown depth: draw nothing */


   /* special loop for 24 bit */
   _align_
stretch_bpp_24:
   movl SX, %eax                 /* SX *= 3 */
   leal (%eax, %eax, 2), %eax
   movl %eax, SX
   movl DX, %eax                 /* DX *= 3 */
   leal (%eax, %eax, 2), %eax
   movl %eax, DX

   _align_
stretch_loop24:
   movl SOURCE, %edx             /* get source line (in esi) and bank */
   movl SY, %eax
   shrl $16, %eax                /* integer part of the fixed point y */
   READ_BANK()
   movl %eax, %esi
   addl SX, %esi

   movl DEST, %edx               /* get dest line (in edi) and bank */
   movl DY, %eax
   WRITE_BANK()
   movl %eax, %edi
   addl DX, %edi
   pushl %edx                    /* this drawer does not preserve these */
   pushl %ebx

   call *%ebx                    /* draw (clobbers eax, ebx, ecx, edx) */

   popl %ebx
   popl %edx
   movl SYD, %eax                /* next line in source bitmap */
   addl %eax, SY
   incl DY                       /* next line in dest bitmap */
   decl DH
   jg stretch_loop24
   jmp stretch_done


   /* special loop for mode-X */
   _align_
stretch_modex_loop:
   movl SOURCE, %edx             /* get source line (in esi) and bank */
   movl SY, %eax
   shrl $16, %eax
   movl BMP_LINE(%edx, %eax, 4), %esi  /* line table read, no bank call */
   addl SX, %esi

   movl DEST, %edx               /* get dest line (in edi) and bank */
   movl DY, %eax
   movl BMP_LINE(%edx, %eax, 4), %edi  /* line table read, no bank call */
   addl DX, %edi

   call *%ebx                    /* draw the line (clobbers eax and ecx) */

   movl SYD, %eax                /* next line in source bitmap */
   addl %eax, SY
   incl DY                       /* next line in dest bitmap */
   decl DH
   jg stretch_modex_loop
   jmp stretch_done


   _align_
stretch_bpp_16:
   shll $1, SX                   /* two bytes per pixel */
   shll $1, DX
   jmp stretch_normal_loop

   _align_
stretch_bpp_32:
   shll $2, SX                   /* four bytes per pixel */
   shll $2, DX
                                 /* falls through into the normal loop */

   /* normal stretching loop */
   _align_
stretch_normal_loop:
   movl SOURCE, %edx             /* get source line (in esi) and bank */
   movl SY, %eax
   shrl $16, %eax
   READ_BANK()
   movl %eax, %esi
   addl SX, %esi

   movl DEST, %edx               /* get dest line (in edi) and bank */
   movl DY, %eax
   WRITE_BANK()
   movl %eax, %edi
   addl DX, %edi

   call *%ebx                    /* draw the line (clobbers eax and ecx) */

   movl SYD, %eax                /* next line in source bitmap */
   addl %eax, SY
   incl DY                       /* next line in dest bitmap */
   decl DH
   jg stretch_normal_loop
                                 /* falls through when all lines are done */

stretch_done:
   movl SOURCE, %edx             /* release both bitmaps */
   UNWRITE_BANK()

   movl DEST, %edx
   UNWRITE_BANK()

   popl %ebx
   popl %esi
   popl %edi
   popw %es
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _do_stretch() */




/* unsigned long _blender_trans24(unsigned long x, y, n);
 *  24 bit trans blender function. See colblend.c for the others.
 *
 *  Interpolates from y towards x by the factor n: each colour field of
 *  the result is y + (x - y) * n / 256, where a non-zero n is first
 *  increased by one. The fields in bits 0-7 and 16-23 are processed
 *  together in one register, the field in bits 8-15 separately.
 *
 *  The result is returned in eax. esi, ecx and ebx are preserved; edx is
 *  left holding y & 0xFF00FF.
 */
FUNC(_blender_trans24)
   pushl %ebp
   movl %esp, %ebp
   pushl %esi
   pushl %ecx
   pushl %ebx

   movl ARG3, %ecx               /* ecx = n */
   movl ARG2, %ebx               /* ebx = y */
   movl ARG1, %esi               /* esi = x */

   testl %ecx, %ecx              /* n == 0 stays 0, otherwise use n+1 */
   jz blender_trans24_scaled
   addl $1, %ecx

blender_trans24_scaled:
   /* bits 0-7 and 16-23, handled as a pair */
   movl %ebx, %edx
   andl $0xFF00FF, %edx          /* edx = y fields */
   movl %esi, %eax
   andl $0xFF00FF, %eax          /* eax = x fields */
   subl %edx, %eax               /* x - y */
   imull %ecx, %eax              /* * n */
   shrl $8, %eax                 /* / 256 */
   addl %ebx, %eax               /* + y (all of it, masked just below) */
   andl $0xFF00FF, %eax

   /* bits 8-15 */
   andl $0xFF00, %esi            /* esi = x field */
   andl $0xFF00, %ebx            /* ebx = y field */
   subl %ebx, %esi               /* x - y */
   imull %ecx, %esi              /* * n */
   shrl $8, %esi                 /* / 256 */
   addl %ebx, %esi               /* + y */
   andl $0xFF00, %esi

   orl %esi, %eax                /* merge the three fields */

   popl %ebx
   popl %ecx
   popl %esi
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of _blender_trans24() */




/* fixed fsqrt(fixed x);
 *  Fixed point square root routine. This code is based on the fixfloat
 *  library by Arne Steinarson.
 *
 *  Returns the square root of x in eax, using the _sqrt_table lookup
 *  table. Zero gives zero. A negative x also gives zero, and in addition
 *  stores ERANGE through the allegro_errno pointer.
 *
 *  ecx is saved and restored, so only eax is changed.
 */
#ifdef ALLEGRO_WATCOM
   FUNC(_fsqrt)                  /* evil hack: fsqrt is a Watcom opcode */
#else
   FUNC(fsqrt)
#endif
   pushl %ebp
   movl %esp, %ebp
   pushl %ecx

   /* This routine uses the following idea:
    *
    *    sqrt (x) = sqrt (x/d) * sqrt(d)
    *    d = 2^(2n)
    *    <=> sqrt (x) = sqrt (x / 2^(2n)) * 2^n
    *
    * i386 ASM bsr instruction is used for getting 2n so that (x/d)
    * falls in the range 0..255. The square root is then calculated using
    * the lookup table _sqrt_table.
    */

   movl ARG1, %eax
   orl %eax, %eax
   jg sqrt_positive
   jz sqrt_zero

   movl GLOBL(allegro_errno), %eax
   movl $ERANGE, (%eax)          /* negative input: set errno */

sqrt_zero:
   xorl %eax, %eax               /* return zero */
   jmp sqrt_done

   _align_
sqrt_positive:
   shrl $7, %eax                 /* get shift-count 2n by scanning x */

   /* bsr leaves its destination undefined when the source is zero, which
    * happens here for every x below 128. Forcing bit 0 on gives a defined
    * scan result of 0 in that case (so 2n = 0 after the rounding below)
    * and never moves the highest set bit of a non-zero value.
    */
   orl $1, %eax
   bsrl %eax, %ecx               /* %cl = index of the highest set bit */
   incb %cl                      /* make it even -->  %cl = 2n */
   andb $0xFE, %cl
   movl ARG1, %eax
   shrl %cl, %eax                /* shift x to fall into range 0..255 */
   movzwl GLOBL(_sqrt_table)(, %eax, 2), %eax
   shrb $1, %cl                  /* %cl = n */
   shll %cl, %eax                /* multiply sqrt(x/2^(2n)) by 2^n */
   shrl $4, %eax                 /* adjust the result */

sqrt_done:
   popl %ecx
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of fsqrt() */




/* fixed fhypot(fixed x, fixed y);
 *  Return fixed point sqrt (x*x+y*y), which is the length of the 
 *  hypotenuse of a right triangle with sides of length x and y, or the 
 *  distance of point (x|y) from the origin. This routine is faster and more 
 *  accurate than using the direct formula fsqrt (fmul (x,x) + fmul (y,y)). 
 *  It will also return correct results for x>=256 or y>=256 where 
 *  fmul (x,x) or fmul (y,y) would overflow.
 *
 *  If the result cannot be represented, 0x7FFFFFFF is returned and ERANGE
 *  is stored through the allegro_errno pointer.
 *
 *  The result is returned in eax. ebx and ecx are saved and restored;
 *  edx is overwritten.
 */
FUNC(fhypot)
   pushl %ebp
   movl %esp, %ebp
   pushl %ebx
   pushl %ecx

   /* The idea of this routine is:
    *    sqrt (x^2+y^2) = sqrt ((x/d)^2+(y/d)^2) * d
    *    d = 2^n
    * d has to be chosen so that (x/d)^2 doesn't overflow. Since x and y
    * are fixed point numbers, they are multiplied in the following way:
    *    x^2 = (x*x)/2^16
    * so we come to the formula:
    *    sqrt(x^2+y^2) = sqrt((x*x + y*y)/2^(16+2n)) * 2^n
    * and this is almost the same problem as calculating the square root in
    * fsqrt: To find the n that results in (x*x+y*y)/2^(16+2n) being
    * in the range 0..255 so that we can use the lookup table.
    */

   movl ARG1, %eax               /* edx:eax = x*x */
   imull %eax 
   movl %eax, %ebx               /* ecx:ebx = x*x */
   movl %edx, %ecx 
   movl ARG2, %eax               /* edx:eax = y*y */
   imull %eax 
   addl %ebx, %eax               /* edx:eax = x*x + y*y */
   adcl %ecx, %edx 
   cmpl $0x3FFFFFFF, %edx        /* check for overflow (unsigned) */
   jbe hypot_no_overflow

   movl GLOBL(allegro_errno), %eax
   movl $ERANGE, (%eax)          /* on overflow, set errno */

   movl $0x7FFFFFFF, %eax        /* and return MAXINT */
   jmp hypot_done

   /* And now we're doing a bit-scan to make (x*x+y*y) fall in the range
    * 0..255. Since the intermediate result is wider than 32 bits and we
    * cannot scan more than 32 bits at once, we first reduce the range to
    * 0..65535 and then reduce to 0..255 in a second step. So the
    * corresponding formula would be:
    *    sqrt(x^2+y^2) = sqrt((x*x + y*y)/2^(16+2*n1+2*n2)) * 2^(n1+n2)
    *
    * bsr leaves its destination undefined when the source is zero, so
    * both scans below are given an operand that always has a bit set.
    */ 

   _align_
hypot_no_overflow:
   /* First scan, on the high dword. 2*edx+1 is never zero and cannot
    * overflow (edx <= 0x3FFFFFFF here). Its highest bit is one above the
    * highest bit of edx, or bit 0 when edx is zero.
    */
   leal 1(%edx, %edx), %ecx 
   bsrl %ecx, %ecx               /* %cl = 2*n1-1, or 0 if edx was zero */
   incb %cl                      /* %cl = 2*n1: make sure it's even */
   andb $0xFE, %cl 
   shrdl %cl, %edx, %eax         /* eax = (x*x+y*y)/2^(2*n1) */
   shrl $16, %eax                /* eax = (x*x+y*y)/2^(16+2*n1) */
   movb %cl, %bl                 /* bl = 2n1 */
   movl %eax, %edx               /* edx = (x*x+y*y)/2^(16+2*n1) */
   shrl $7, %edx                 /* do another scan on edx to get n2 */
   orl $1, %edx                  /* never scan zero: bit 0 scans as 0 */
   bsrl %edx, %ecx 
   incb %cl                      /* %cl = 2*n2 -- make sure it's even */
   andb $0xFE, %cl 
   shrl %cl, %eax                /* eax = (x*x+y*y)/2^(16+2*n1+2*n2) */
   movzwl GLOBL(_sqrt_table)(, %eax, 2), %eax
   addb %bl, %cl                 /* %cl = 2*n1+2*n2 */
   xorl %edx, %edx 
   shrb $1, %cl                  /* %cl = n1+n2 */
   shldl %cl, %eax, %edx         /* multiply result with 2^(n1+n2)... */
   shll %cl, %eax                /* ...as a 64 bit value in edx:eax */
   shrdl $4, %edx, %eax          /* adjust lookup table value */

hypot_done:
   popl %ecx
   popl %ebx
   movl %ebp, %esp
   popl %ebp
   ret                           /* end of fhypot() */

