
;	optimized mdct() for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

;	1999/09/15	subband.nas̕
;				ϐ̕ύX
;				xxx -> sbd_xxx
;				enwindow_sse -> enwindow
;	2000/03/27	sbd_shiftin̕

%include "nasm.h"

	globaldef	window_filter_subband_SSE
	globaldef	frame_shiftin_multi_SSE
	globaldef	sbd_shiftin_SSE
	globaldef	sbd_shiftin_SSE_MULTI
	externdef	enwindow
	externdef	idct_coefficient
	externdef	sbd_xxx

HAN_SIZE	equ	512		;defined in common.h
SBLIMIT		equ	32
EXTRADELAY	equ	56		;defined in musenc.c

F_SIZE	equ	4

	segment_data
	segment_code

;void frame_shiftin_multi_SSE(int (*int_buf)[1152+576+EXTRADELAY], short *frame_buf, float (*flt_buf)[1152+HAN_SIZE], int mode_gr, int stereo)
		align	16
; Dispatcher (32-bit cdecl, nothing pushed yet): the 5th argument `stereo`
; lives at [esp+20].  Only its low byte is inspected.  Both targets are
; reached by a jump, so they see exactly the caller's stack frame and
; return straight to the caller.
frame_shiftin_multi_SSE:
		cmp		byte [esp+20],2				; stereo == 2 ?
		jne		near frame_shiftin_mono_SSE	; anything else -> mono path
		jmp		frame_shiftin_stereo_SSE	; two channels  -> stereo path

;void frame_shiftin_stereo(int (*int_buf)[1152+576+EXTRADELAY], short *frame_buf, float (*flt_buf)[1152+HAN_SIZE], int mode_gr)
;void frame_shiftin_mono(int (*int_buf)[1152+576+EXTRADELAY], short *frame_buf, float (*flt_buf)[1152+HAN_SIZE], int mode_gr)
;
%if 0
; Ђ^ϊȂRs[邾Ȃ񂾂...
; SonAint_buf̐擪576vfƍŌ56vf͎gȂB
; SoffAint_buf͕svB
;
static short saved_frame[(HAN_SIZE - 32)+(576+EXTRADELAY)][2];
/* EXTRADELAY ́A56B */
/* fast mode */
; ƑȂ͂B
if(mode_gr == 2){
/* source,             destination,... */
saved_frame[ 479:   0] flt_buf[ 552:1663]                        480+576+56 samples
  frame_buf[  39:   0] flt_buf[ 512: 551]                        40(96-56)samples
  frame_buf[ 519:  40] flt_buf[  32: 511] saved_frame[ 479:   0] 480 samples
  frame_buf[1151: 520]                    saved_frame[1111: 480] 56+576 samples
}else{
/* source,             destination,... */
saved_frame[ 575:   0] flt_buf[1088:1663]                        480+96 samples
saved_frame[1055: 576] flt_buf[ 608:1087] saved_frame[ 479:   0] 480 samples
saved_frame[1111:1056]                    saved_frame[ 535: 480] 56 samples
  frame_buf[ 575:   0]                    saved_frame[1111: 536] 576 samples
}

/* psycho acoustic analysis mode */
; ̂ƂB
if(mode_gr == 2){
/* source,             destination,... */
saved_frame[1055:   0] flt_buf[ 608:1663]                                           480+576 samples
saved_frame[1111:1056] flt_buf[ 552: 607] int_buf[ 631: 576]                        56 samples
  frame_buf[  39:   0] flt_buf[ 512: 551] int_buf[ 671: 632]                        40(96-56)samples
  frame_buf[ 519:  40] flt_buf[  32: 511] int_buf[1151: 672] saved_frame[ 479:   0] 480 samples
  frame_buf[1151: 520]                    int_buf[1783:1152] saved_frame[1111: 480] 56+576 samples
}else{
/* source,             destination,... */
saved_frame[ 575:   0] flt_buf[1088:1663]                                           480+96 samples
saved_frame[1055: 576] flt_buf[ 608:1087]                    saved_frame[ 479:   0] 480 samples
saved_frame[1111:1056]                    int_buf[ 631: 576] saved_frame[ 535: 480] 56 samples
  frame_buf[ 575:   0]                    int_buf[1207: 632] saved_frame[1111: 536] 576 samples
}
%endif

;
;	macros
;
; stereo4	source_adr,int_adr,short_adr,xmm_reg,xmm_reg,xmm_reg,xmm_reg
;			mm0, mm1, mm2, mm3, mm7 = 16
; stereo4_half  src, int_dst, short_dst, xmm_l, xmm_r, byte_ofs, mm_l, mm_r
;
; Helper for stereo4: handles two interleaved 16-bit L/R pairs (8 bytes)
; found at [src+byte_ofs].
;   mm_l     <- the two L samples, sign-extended to 32 bits
;   mm_r     <- the two R samples, sign-extended to 32 bits
;   short_dst (unless `none`): raw 8 input bytes    -> [short_dst+byte_ofs]
;   int_dst   (unless `none`): L ints               -> [int_dst+byte_ofs]
;                              R ints               -> [int_dst+byte_ofs+sizeof_int_buf]
;   xmm_l / xmm_r (unless `none`): low two lanes receive L / R as floats
; Expects mm7 = 16 (the shift count).
%macro	stereo4_half	8
		movq	%7,[%1+%6]
		movq	%8,%7
%ifnidni	%3,none
		movq	[%3+%6],%7
%endif
		pslld	%7,mm7				; move the low word (L) to the top ...
		psrad	%7,mm7				; ... and bring it back sign-extended
%ifnidni	%4,none
		cvtpi2ps	%4,%7			; L
%endif
		psrad	%8,mm7				; high word (R), sign-extended
%ifnidni	%2,none
		movq	[%2+%6],%7
%endif
%ifnidni	%5,none
		cvtpi2ps	%5,%8			; R
%endif
%ifnidni	%2,none
		movq	[%2+%6+sizeof_int_buf],%8
%endif
%endmacro

; stereo4  src, int_dst, short_dst, xmm_l01, xmm_r01, xmm_l23, xmm_r23
;
; Processes four interleaved stereo samples (16 bytes) starting at [src].
; Every argument after `src` is optional; leaving it out (or passing `none`)
; suppresses the corresponding store / conversion.
; Uses mm0, mm1 (samples 0-1) and mm2, mm3 (samples 2-3); mm7 must hold 16.
%macro	stereo4	1-7	none,none,none,none,none,none
		stereo4_half	%1,%2,%3,%4,%5,0,mm0,mm1
		stereo4_half	%1,%2,%3,%6,%7,8,mm2,mm3
%endmacro

;
; mono4	source_adr,int_adr,short_adr,xmm_reg,xmm_reg
;			mm0, mm1, mm7 = 16
; mono4  src, int_dst, short_dst, xmm_01, xmm_23
;
; Processes four 16-bit mono samples (8 bytes) starting at [src].
;   short_dst (unless `none`): raw 8 input bytes             -> [short_dst]
;   int_dst   (unless `none`): samples as 32-bit signed ints -> [int_dst], [int_dst+8]
;   xmm_01 / xmm_23 (unless `none`): low two lanes receive samples 0-1 / 2-3
;                                    converted to float
; Uses mm0, mm1; mm7 must hold 16 (the shift count).
; The macro takes 1 to 5 parameters, so exactly four defaults are listed
; (one per optional parameter).
%macro	mono4	1-5	none,none,none,none
		movq	mm0,[%1]
		movq	mm1,mm0
%ifnidni	%3,none
		movq	[%3],mm0
%endif
		punpcklwd	mm0,mm0			; duplicate words 0,1 into both halves of each dword
		psrad	mm0,mm7				; keep the upper copy, sign-extended -> samples 0,1
%ifnidni	%4,none
		cvtpi2ps	%4,mm0
%endif
%ifnidni	%2,none
		movq	[%2],mm0
%endif
		punpckhwd	mm1,mm1			; same for words 2,3
		psrad	mm1,mm7				; -> samples 2,3
%ifnidni	%5,none
		cvtpi2ps	%5,mm1
%endif
%ifnidni	%2,none
		movq	[%2+8],mm1
%endif
%endmacro

;
; compose2type1 xmm_reg,xmm_reg,xmm_reg
;
; input
;	%1 = [xx, xx, L2, L1]
;	%2 = [xx, xx, L4, L3]
;	%3 = [L7, L6, L5, L8]
; output
;	%1 = [L3, L2, L1, L4]
;	%3 = [L7, L6, L5, L4]
; Lanes are written [3, 2, 1, 0] (highest first), as in the notes above.
;   movlhps : %1 = [L4, L3, L2, L1]   (low half of %2 becomes high half of %1)
;   shufps  : rotate lanes up by one  -> %1 = [L3, L2, L1, L4]
;   movss   : lane 0 of %3 <- L4      -> %3 = [L7, L6, L5, L4]
%macro	compose2type1	3
		movlhps	%1,%2
		shufps	%1,%1,0x93			; selectors (hi..lo) 2,1,0,3
		movss	%3,%1
%endmacro

;
; compose2type2 xmm_reg,xmm_reg
;
; input
;	%1 = [xx, xx, L1, L0]
;	%2 = [xx, xx, L3, L2]
; output
;	%2 = [L0, L1, L2, L3]
; Lanes are written [3, 2, 1, 0] (highest first), as in the notes above.
; One shuffle reverses the four samples: the low two lanes of the result
; come from %2 (swapped), the high two lanes from %1 (swapped).
%macro	compose2type2	2
		shufps	%2,%1,0x11			; selectors (hi..lo) 0,1 from %1 | 0,1 from %2
%endmacro

;
;		2000/04/17	SMPframe_shiftin_stereo
;
; EDX is simply incremented. Source(=ESI) is changed according to EDX.
; This routine is split into 4 parts.
%define	omit576
;
;void frame_shiftin_stereo_SSE(int (*int_buf)[1152+576+EXTRADELAY], short *frame_buf, float (*flt_buf)[1152+HAN_SIZE], int mode_gr)
%define	saved_frame	sbd_xxx
%define	sizeof_flt_buf	(1152+HAN_SIZE)*4
%define	sizeof_int_buf	(1152+576+EXTRADELAY)*4
		align	16
;-----------------------------------------------------------------------
; void frame_shiftin_stereo_SSE(int (*int_buf)[1152+576+EXTRADELAY],
;                               short *frame_buf,
;                               float (*flt_buf)[1152+HAN_SIZE],
;                               int mode_gr)
; ABI:    32-bit cdecl, arguments on the stack (read via [esp+_P+n]).
; Input:  interleaved 16-bit L/R samples, taken first from saved_frame
;         (= sbd_xxx) and then from frame_buf.
; Output: flt_buf[0] / flt_buf[1]  - L / R samples as float (parts 1-3);
;                                    each group of 32 is stored in the
;                                    permuted order built by compose2type1
;                                    and compose2type2, groups running from
;                                    high addresses down.
;         int_buf[0] / int_buf[1]  - L / R samples as 32-bit ints (parts 2-4)
;         saved_frame              - raw copy of the consumed input (parts 3-4)
; Registers inside the loops:
;         edx = running input sample index (only ever incremented)
;         ecx = remaining count of the current part
;         esi = input base, ebx = flt_buf base, edi = int_buf base,
;         ebp = frame_buf base in part 2, saved_frame store base afterwards
;         mm7 = 16, the shift count required by stereo4
; Clobbers eax, ecx, edx, mm0-mm3, mm7, xmm0-xmm7, flags.
; esi, edi, ebx, ebp are saved/restored; emms is executed before returning.
; The flt_buf stores use movaps, so flt_buf must be 16-byte aligned.
;-----------------------------------------------------------------------
frame_shiftin_stereo_SSE:
		push	esi
		push	edi
		push	ebx
		push	ebp
%define	_P	4*4
		mov		eax,16
		movd	mm7,eax				; mm7 is used in stereo4 macro
		xor		edx,edx

;;
%ifndef	omit576
; saved_frame[ 479:   0] flt_buf[1184:1663] 480 samples.
.part1:
		mov		ebx,[esp+_P+12]		; = flt_buf
		mov		ecx,480
		add		ebx,(1663-31-480)*4	; = &flt_buf[1663-31-480]
		mov		esi,saved_frame
		jmp		short .part1.f0
%else
.part1:

		mov		eax,[esp+_P+16]		; = mode_gr
		mov		ebx,[esp+_P+12]		; = flt_buf
		cmp		al,2
		mov		esi,saved_frame
		je		.part1.gr2
; saved_frame[ 479:   0] flt_buf[1184:1663] 480 samples.
.part1.gr1:
		mov		ecx,480
		add		ebx,(1663-31-480)*4	; = &flt_buf[1663-31-480]
		jmp		short .part1.f0

; saved_frame[ 479:   0] flt_buf[1184:1663] 480 samples.
; saved_frame[1055: 480] flt_buf[ 608:1183] 576 samples
.part1.gr2:
		mov		ecx,480+576
		add		ebx,(1663-31-480-576)*4	; = &flt_buf[1663-31-480-576]
		jmp		short .part1.f0
%endif
;saved_frame[ 479:   0] flt_buf[1184:1663]                                           480 samples
; 1st part:
;   index for flt_buf is decremented.
;       input:
;           ecx = 480                      (480+576 when omit576 and mode_gr == 2)
;           edx = 0
;           ebx = &flt_buf[1663-31-480]    (576 entries lower when omit576 and mode_gr == 2)
;           esi = &saved_frame[0]
;       output:
;           ecx = 0
;           edx = 480                      (480+576 when omit576 and mode_gr == 2)

		align	16
.part1.lp0:
.part1.f0:
		stereo4	esi+edx*4+12*4,none,none,xmm0,xmm1,xmm2,xmm3
		compose2type1	xmm0,xmm2,xmm6	; get L[31,30,29,16],L[xx,xx,xx,16]
		compose2type1	xmm1,xmm3,xmm7	; get R[31,30,29,16],R[xx,xx,xx,16]

		stereo4	esi+edx*4+ 8*4,none,none,xmm4,xmm5,xmm2,xmm3
		compose2type1	xmm4,xmm2,xmm0	; get L[27,26,25,28],L[31,30,29,28]
		movaps	[ebx+ecx*4+28*4],xmm0
		compose2type1	xmm5,xmm3,xmm1	; get R[27,26,25,28],R[31,30,29,28]
		movaps	[ebx+ecx*4+28*4+sizeof_flt_buf],xmm1

		stereo4	esi+edx*4+ 4*4,none,none,xmm0,xmm1,xmm2,xmm3
		compose2type1	xmm0,xmm2,xmm4	; get L[23,22,21,24],L[27,26,25,24]
		movaps	[ebx+ecx*4+24*4],xmm4
		compose2type1	xmm1,xmm3,xmm5	; get R[23,22,21,24],R[27,26,25,24]
		movaps	[ebx+ecx*4+24*4+sizeof_flt_buf],xmm5

		stereo4	esi+edx*4+ 0*4,none,none,xmm4,xmm5,xmm2,xmm3
		compose2type1	xmm4,xmm2,xmm0	; get L[19,18,17,20],L[23,22,21,20]
		movaps	[ebx+ecx*4+20*4],xmm0
		compose2type1	xmm5,xmm3,xmm1	; get R[19,18,17,20],R[23,22,21,20]
		movaps	[ebx+ecx*4+20*4+sizeof_flt_buf],xmm1

		movss	xmm4,xmm6				; get L[19,18,17,16]
		movaps	[ebx+ecx*4+16*4],xmm4
		movss	xmm5,xmm7				; get R[19,18,17,16]
		movaps	[ebx+ecx*4+16*4+sizeof_flt_buf],xmm5

		stereo4	esi+edx*4+16*4,none,none,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+12*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+12*4+sizeof_flt_buf],xmm3

		stereo4	esi+edx*4+20*4,none,none,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+8*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+8*4+sizeof_flt_buf],xmm3

		stereo4	esi+edx*4+24*4,none,none,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+4*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+4*4+sizeof_flt_buf],xmm3

		stereo4	esi+edx*4+28*4,none,none,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+0*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+0*4+sizeof_flt_buf],xmm3

		add		edx,32
		sub		ecx,32
		jnz		near .part1.lp0

;;
%ifndef	omit576
.part2:
		mov		ebp,[esp+_P+8]		; = frame_buf
		mov		edi,[esp+_P+4]		; = int_buf
		mov		eax,[esp+_P+16]		; = mode_gr
		sub		ebp,1112*4			; = &frame_buf[-1112]
		sub		edi,480*4			; = &int_buf[-480]
		cmp		al,2
		jne		.part2.gr1
; granule 2
;saved_frame[1055: 480] flt_buf[ 608:1183] int_buf[ 575:   0] 576 samples
;saved_frame[1111:1056] flt_buf[ 552: 607] int_buf[ 631: 576] 56 samples
;  frame_buf[  39:   0] flt_buf[ 512: 551] int_buf[ 671: 632] 40(96-56)samples
.part2.gr2:
		sub		ebx,672*4			; = &flt_buf[1152-672]
		mov		ecx,672
		jmp		short .part2.f0

; granule 1
;saved_frame[ 575: 480] flt_buf[1088:1183] int_buf[  95:   0] 96 samples
.part2.gr1:
		sub		ebx,96*4			; = &flt_buf[1152-96]
		mov		ecx,96
		jmp		short .part2.f0
%else
.part2:
		mov		ebp,[esp+_P+8]		; = frame_buf
		mov		edi,[esp+_P+4]		; = int_buf
		sub		ebp,1112*4			; = &frame_buf[-1112]
		sub		edi,480*4			; = &int_buf[-480]
		sub		ebx,96*4			; = &flt_buf[1152-576-96] or &flt_buf[1152-96]
		mov		ecx,96
		jmp		short .part2.f0
; granule 2
;saved_frame[1111:1056] flt_buf[ 552: 607] int_buf[ 631: 576] 56 samples
;  frame_buf[  39:   0] flt_buf[ 512: 551] int_buf[ 671: 632] 40(96-56)samples

; granule 1
;saved_frame[ 575: 480] flt_buf[1088:1183] int_buf[  95:   0] 96 samples
%endif

; 2nd part:
;       input:    (mode_gr == 1)           (mode_gr == 2)
;           ecx = 96                       672(576+96)
;           edx = 480                      480
;           ebx = &flt_buf[1152-96]        &flt_buf[1152-672]
;           edi = &int_buf[-480]           &int_buf[-480]
;           esi = &saved_frame[0]          &saved_frame[0]
;           ebp =                          &frame_buf[-1112]
;       output:
;           ecx = 0                        0
;           edx = 480+96                   480+672
;   (with omit576 defined, part 1 has already consumed the extra 576 samples
;    of mode_gr == 2, so this part starts with ecx = 96 and edx = 480+576)

		align	16
.part2.lp0:
.part2.f0:
		stereo4	esi+edx*4+12*4,edi+edx*4+12*4,none,xmm0,xmm1,xmm2,xmm3
		compose2type1	xmm0,xmm2,xmm6	; get L[31,30,29,16],L[xx,xx,xx,16]
		compose2type1	xmm1,xmm3,xmm7	; get R[31,30,29,16],R[xx,xx,xx,16]

		stereo4	esi+edx*4+ 8*4,edi+edx*4+ 8*4,none,xmm4,xmm5,xmm2,xmm3
		compose2type1	xmm4,xmm2,xmm0	; get L[27,26,25,28],L[31,30,29,28]
		movaps	[ebx+ecx*4+28*4],xmm0
		compose2type1	xmm5,xmm3,xmm1	; get R[27,26,25,28],R[31,30,29,28]
		movaps	[ebx+ecx*4+28*4+sizeof_flt_buf],xmm1

		stereo4	esi+edx*4+ 4*4,edi+edx*4+ 4*4,none,xmm0,xmm1,xmm2,xmm3
		compose2type1	xmm0,xmm2,xmm4	; get L[23,22,21,24],L[27,26,25,24]
		movaps	[ebx+ecx*4+24*4],xmm4
		compose2type1	xmm1,xmm3,xmm5	; get R[23,22,21,24],R[27,26,25,24]
		movaps	[ebx+ecx*4+24*4+sizeof_flt_buf],xmm5

		stereo4	esi+edx*4+ 0*4,edi+edx*4+ 0*4,none,xmm4,xmm5,xmm2,xmm3
		compose2type1	xmm4,xmm2,xmm0	; get L[19,18,17,20],L[23,22,21,20]
		movaps	[ebx+ecx*4+20*4],xmm0
		compose2type1	xmm5,xmm3,xmm1	; get R[19,18,17,20],R[23,22,21,20]
		movaps	[ebx+ecx*4+20*4+sizeof_flt_buf],xmm1

		movss	xmm4,xmm6				; get L[19,18,17,16]
		movaps	[ebx+ecx*4+16*4],xmm4
		movss	xmm5,xmm7				; get R[19,18,17,16]
		movaps	[ebx+ecx*4+16*4+sizeof_flt_buf],xmm5

		stereo4	esi+edx*4+16*4,edi+edx*4+16*4,none,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+12*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+12*4+sizeof_flt_buf],xmm3

		stereo4	esi+edx*4+20*4,edi+edx*4+20*4,none,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+8*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+8*4+sizeof_flt_buf],xmm3

		; Input sample 1112 is where saved_frame ends: from there on the
		; source is frame_buf (ebp = &frame_buf[-1112]).  The switch point
		; falls 24 samples into the group that starts at edx = 480+576+32.
		cmp		edx,480+576+32
		cmovae	esi,ebp

		stereo4	esi+edx*4+24*4,edi+edx*4+24*4,none,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+4*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+4*4+sizeof_flt_buf],xmm3

		stereo4	esi+edx*4+28*4,edi+edx*4+28*4,none,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+0*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+0*4+sizeof_flt_buf],xmm3

		add		edx,32
		sub		ecx,32
		jnz		near .part2.lp0

;;
.part3:
		mov		ecx,480
		mov		ebp,saved_frame
		shl		edx,2				; sample index -> byte offset (4 bytes per stereo sample)
		sub		ebp,edx				; = &saved_frame[-480-96] or &saved_frame[-480-672]
		sub		ebx,480*4				; = &flt_buf[1152-96-480] or &flt_buf[1152-672-480]
		shr		edx,2				; back to a sample index
		jmp		short .part3.f0
; granule 2
; frame_buf[ 519:  40] flt_buf[  32: 511] int_buf[1151: 672] saved_frame[ 479:   0] 480 samples

; granule 1
; saved_frame[1055: 576] flt_buf[ 608:1087] int_buf[ 575:  96] saved_frame[ 479:   0] 480 samples

; 3rd part:
;       input:    (mode_gr == 1)           (mode_gr == 2)
;           ecx = 480                      480
;           edx = 480+96                   480+672
;           ebx = &flt_buf[1152-96-480]    &flt_buf[1152-672-480]
;           edi = &int_buf[-480]           &int_buf[-480]
;           esi = &saved_frame[0]          &frame_buf[-1112]
;           ebp = &saved_frame[-480-96]    &saved_frame[-480-672]
;       output:
;           ecx = 0                        0
;           edx = 480+96+480               480+672+480
;
		align	16
.part3.lp0:
.part3.f0:

		stereo4	esi+edx*4+12*4,edi+edx*4+12*4,ebp+edx*4+12*4,xmm0,xmm1,xmm2,xmm3
		compose2type1	xmm0,xmm2,xmm6	; get L[31,30,29,16],L[xx,xx,xx,16]
		compose2type1	xmm1,xmm3,xmm7	; get R[31,30,29,16],R[xx,xx,xx,16]

		stereo4	esi+edx*4+ 8*4,edi+edx*4+ 8*4,ebp+edx*4+ 8*4,xmm4,xmm5,xmm2,xmm3
		compose2type1	xmm4,xmm2,xmm0	; get L[27,26,25,28],L[31,30,29,28]
		movaps	[ebx+ecx*4+28*4],xmm0
		compose2type1	xmm5,xmm3,xmm1	; get R[27,26,25,28],R[31,30,29,28]
		movaps	[ebx+ecx*4+28*4+sizeof_flt_buf],xmm1

		stereo4	esi+edx*4+ 4*4,edi+edx*4+ 4*4,ebp+edx*4+ 4*4,xmm0,xmm1,xmm2,xmm3
		compose2type1	xmm0,xmm2,xmm4	; get L[23,22,21,24],L[27,26,25,24]
		movaps	[ebx+ecx*4+24*4],xmm4
		compose2type1	xmm1,xmm3,xmm5	; get R[23,22,21,24],R[27,26,25,24]
		movaps	[ebx+ecx*4+24*4+sizeof_flt_buf],xmm5

		stereo4	esi+edx*4+ 0*4,edi+edx*4+ 0*4,ebp+edx*4+ 0*4,xmm4,xmm5,xmm2,xmm3
		compose2type1	xmm4,xmm2,xmm0	; get L[19,18,17,20],L[23,22,21,20]
		movaps	[ebx+ecx*4+20*4],xmm0
		compose2type1	xmm5,xmm3,xmm1	; get R[19,18,17,20],R[23,22,21,20]
		movaps	[ebx+ecx*4+20*4+sizeof_flt_buf],xmm1

		movss	xmm4,xmm6				; get L[19,18,17,16]
		movaps	[ebx+ecx*4+16*4],xmm4
		movss	xmm5,xmm7				; get R[19,18,17,16]
		movaps	[ebx+ecx*4+16*4+sizeof_flt_buf],xmm5

		stereo4	esi+edx*4+16*4,edi+edx*4+16*4,ebp+edx*4+16*4,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+12*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+12*4+sizeof_flt_buf],xmm3

		stereo4	esi+edx*4+20*4,edi+edx*4+20*4,ebp+edx*4+20*4,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+8*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+8*4+sizeof_flt_buf],xmm3

		stereo4	esi+edx*4+24*4,edi+edx*4+24*4,ebp+edx*4+24*4,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+4*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+4*4+sizeof_flt_buf],xmm3

		stereo4	esi+edx*4+28*4,edi+edx*4+28*4,ebp+edx*4+28*4,xmm0,xmm1,xmm2,xmm3
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+0*4],xmm2
		compose2type2	xmm1,xmm3
		movaps	[ebx+ecx*4+0*4+sizeof_flt_buf],xmm3

		add		edx,32
		sub		ecx,32
		jnz		near .part3.lp0

;;
.part4:
		mov		eax,[esp+_P+16]		; = mode_gr
		cmp		al,2
		je		.part4.gr2
.part4.gr1:
; granule 1
; saved_frame[1111:1056] int_buf[ 631: 576] saved_frame[ 535: 480] 56 samples

; 4th part:
;       input:    (mode_gr == 1)
;           ecx = 56/4 (iterations, 4 samples each)
;           edx = 480+96+480
;           edi = &int_buf[-480]
;           esi = &saved_frame[0]
;           ebp = &saved_frame[-480-96]
;       output:
;           ecx = 0
;           edx = 480+96+480+56

		mov		ecx,56/4
		jmp		short .part4.gr1.f0

		align	16
.part4.gr1.lp0:
.part4.gr1.f0:
		stereo4	esi+edx*4+0,edi+edx*4+0,ebp+edx*4+0	; saved_frame, int_buf, saved_frame
		add		edx,4
		dec		ecx					; dec/jnz rather than `loop`, same as .part4.lp0
		jnz		.part4.gr1.lp0
;;
		mov		esi,[esp+_P+8]		; frame_buf
		sub		esi,1112*4
		mov		ecx,576/4
		jmp		short .part4.f0

.part4.gr2:
		mov		ecx,632/4
		jmp		short .part4.f0

; granule 1
;   frame_buf[ 575:   0] int_buf[1207: 632] saved_frame[1111: 536] 576 samples
; granule 2
;   frame_buf[1151: 520] int_buf[1783:1152] saved_frame[1111: 480] 576+56 samples

; 4th part:
;       input:    (mode_gr == 1)           (mode_gr == 2)
;           ecx = 576/4                    632/4 ((576+56)/4)   iterations, 4 samples each
;           edx = 480+96+480+56            480+672+480
;           edi = &int_buf[-480]           &int_buf[-480]
;           esi = &frame_buf[-1112]        &frame_buf[-1112]
;           ebp = &saved_frame[-480-96]    &saved_frame[-480-672]
;       output:
;           ecx = 0                        0
;           edx = 480+96+480+56+576        480+672+480+632

		align	16
.part4.lp0:
.part4.f0:
		stereo4	esi+edx*4+0,edi+edx*4+0,ebp+edx*4+0
		add		edx,4
%if 1
		dec		ecx
		jnz		.part4.lp0
%else
		loop	.part4.lp0
%endif
;;
		pop		ebp
		pop		ebx
		pop		edi
		pop		esi
		emms						; leave MMX state before returning to the caller
		ret
%undef	saved_frame

;
;		2000/04/18	SMPframe_shiftin_mono
;
; EDX is simply incremented. Source(=ESI) is changed according to EDX.
; This routine is split into 4 parts.
%define	omit576
;
;void frame_shiftin_mono_SSE(int (*int_buf)[1152+576+EXTRADELAY], short *frame_buf, float (*flt_buf)[1152+HAN_SIZE], int mode_gr)
%define	saved_frame	sbd_xxx
%define	sizeof_flt_buf	(1152+HAN_SIZE)*4
%define	sizeof_int_buf	(1152+576+EXTRADELAY)*4
		align	16
;-----------------------------------------------------------------------
; void frame_shiftin_mono_SSE(int (*int_buf)[1152+576+EXTRADELAY],
;                             short *frame_buf,
;                             float (*flt_buf)[1152+HAN_SIZE],
;                             int mode_gr)
; ABI:    32-bit cdecl, arguments on the stack (read via [esp+_P+n]).
; Input:  16-bit mono samples, taken first from saved_frame (= sbd_xxx)
;         and then from frame_buf (2 bytes per sample, hence edx*2).
; Output: flt_buf[0]   - samples as float (parts 1-3); each group of 32 is
;                        stored in the permuted order built by
;                        compose2type1 and compose2type2, groups running
;                        from high addresses down.
;         int_buf[0]   - samples as 32-bit ints (parts 2-4)
;         saved_frame  - raw copy of the consumed input (parts 3-4)
; Registers inside the loops:
;         edx = running input sample index (only ever incremented)
;         ecx = remaining count of the current part
;         esi = input base, ebx = flt_buf base, edi = int_buf base,
;         ebp = frame_buf base in part 2, saved_frame store base afterwards
;         mm7 = 16, the shift count required by mono4
; Clobbers eax, ecx, edx, mm0, mm1, mm7, xmm0, xmm2, xmm4, xmm6, flags.
; esi, edi, ebx, ebp are saved/restored; emms is executed before returning.
; The flt_buf stores use movaps, so flt_buf must be 16-byte aligned.
;-----------------------------------------------------------------------
frame_shiftin_mono_SSE:
		push	esi
		push	edi
		push	ebx
		push	ebp
%define	_P	4*4
		mov		eax,16
		movd	mm7,eax				; mm7 is used in mono4 macro
		xor		edx,edx

;;
%ifndef	omit576
; saved_frame[ 479:   0] flt_buf[1184:1663] 480 samples.
.part1:
		mov		ebx,[esp+_P+12]		; = flt_buf
		mov		ecx,480
		add		ebx,(1663-31-480)*4	; = &flt_buf[1663-31-480]
		mov		esi,saved_frame
		jmp		short .part1.f0
%else
.part1:

		mov		eax,[esp+_P+16]		; = mode_gr
		mov		ebx,[esp+_P+12]		; = flt_buf
		cmp		al,2
		mov		esi,saved_frame
		je		.part1.gr2
; saved_frame[ 479:   0] flt_buf[1184:1663] 480 samples.
.part1.gr1:
		mov		ecx,480
		add		ebx,(1663-31-480)*4	; = &flt_buf[1663-31-480]
		jmp		short .part1.f0

; saved_frame[ 479:   0] flt_buf[1184:1663] 480 samples.
; saved_frame[1055: 480] flt_buf[ 608:1183] 576 samples
.part1.gr2:
		mov		ecx,480+576
		add		ebx,(1663-31-480-576)*4	; = &flt_buf[1663-31-480-576]
		jmp		short .part1.f0
%endif
;saved_frame[ 479:   0] flt_buf[1184:1663]                                           480 samples
; 1st part:
;   index for flt_buf is decremented.
;       input:
;           ecx = 480                      (480+576 when omit576 and mode_gr == 2)
;           edx = 0
;           ebx = &flt_buf[1663-31-480]    (576 entries lower when omit576 and mode_gr == 2)
;           esi = &saved_frame[0]
;       output:
;           ecx = 0
;           edx = 480                      (480+576 when omit576 and mode_gr == 2)

		align	16
.part1.lp0:
.part1.f0:
		mono4	esi+edx*2+12*2,none,none,xmm0,xmm2
		compose2type1	xmm0,xmm2,xmm6	; get L[31,30,29,16],L[xx,xx,xx,16]

		mono4	esi+edx*2+ 8*2,none,none,xmm4,xmm2
		compose2type1	xmm4,xmm2,xmm0	; get L[27,26,25,28],L[31,30,29,28]
		movaps	[ebx+ecx*4+28*4],xmm0

		mono4	esi+edx*2+ 4*2,none,none,xmm0,xmm2
		compose2type1	xmm0,xmm2,xmm4	; get L[23,22,21,24],L[27,26,25,24]
		movaps	[ebx+ecx*4+24*4],xmm4

		mono4	esi+edx*2+ 0*2,none,none,xmm4,xmm2
		compose2type1	xmm4,xmm2,xmm0	; get L[19,18,17,20],L[23,22,21,20]
		movaps	[ebx+ecx*4+20*4],xmm0

		movss	xmm4,xmm6				; get L[19,18,17,16]
		movaps	[ebx+ecx*4+16*4],xmm4

		mono4	esi+edx*2+16*2,none,none,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+12*4],xmm2

		mono4	esi+edx*2+20*2,none,none,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+8*4],xmm2

		mono4	esi+edx*2+24*2,none,none,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+4*4],xmm2

		mono4	esi+edx*2+28*2,none,none,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+0*4],xmm2

		add		edx,32
		sub		ecx,32
		jnz		near .part1.lp0

;;
%ifndef	omit576
.part2:
		mov		ebp,[esp+_P+8]		; = frame_buf
		mov		edi,[esp+_P+4]		; = int_buf
		mov		eax,[esp+_P+16]		; = mode_gr
		sub		ebp,1112*2			; = &frame_buf[-1112]
		sub		edi,480*4			; = &int_buf[-480]
		cmp		al,2
		jne		.part2.gr1
; granule 2
;saved_frame[1055: 480] flt_buf[ 608:1183] int_buf[ 575:   0] 576 samples
;saved_frame[1111:1056] flt_buf[ 552: 607] int_buf[ 631: 576] 56 samples
;  frame_buf[  39:   0] flt_buf[ 512: 551] int_buf[ 671: 632] 40(96-56)samples
.part2.gr2:
		sub		ebx,672*4			; = &flt_buf[1152-672]
		mov		ecx,672
		jmp		short .part2.f0

; granule 1
;saved_frame[ 575: 480] flt_buf[1088:1183] int_buf[  95:   0] 96 samples
.part2.gr1:
		sub		ebx,96*4			; = &flt_buf[1152-96]
		mov		ecx,96
		jmp		short .part2.f0
%else
.part2:
		mov		ebp,[esp+_P+8]		; = frame_buf
		mov		edi,[esp+_P+4]		; = int_buf
		sub		ebp,1112*2			; = &frame_buf[-1112]
		sub		edi,480*4			; = &int_buf[-480]
		sub		ebx,96*4			; = &flt_buf[1152-576-96] or &flt_buf[1152-96]
		mov		ecx,96
		jmp		short .part2.f0
; granule 2
;saved_frame[1111:1056] flt_buf[ 552: 607] int_buf[ 631: 576] 56 samples
;  frame_buf[  39:   0] flt_buf[ 512: 551] int_buf[ 671: 632] 40(96-56)samples

; granule 1
;saved_frame[ 575: 480] flt_buf[1088:1183] int_buf[  95:   0] 96 samples
%endif

; 2nd part:
;       input:    (mode_gr == 1)           (mode_gr == 2)
;           ecx = 96                       672(576+96)
;           edx = 480                      480
;           ebx = &flt_buf[1152-96]        &flt_buf[1152-672]
;           edi = &int_buf[-480]           &int_buf[-480]
;           esi = &saved_frame[0]          &saved_frame[0]
;           ebp =                          &frame_buf[-1112]
;       output:
;           ecx = 0                        0
;           edx = 480+96                   480+672
;   (with omit576 defined, part 1 has already consumed the extra 576 samples
;    of mode_gr == 2, so this part starts with ecx = 96 and edx = 480+576)

		align	16
.part2.lp0:
.part2.f0:
		mono4	esi+edx*2+12*2,edi+edx*4+12*4,none,xmm0,xmm2
		compose2type1	xmm0,xmm2,xmm6	; get L[31,30,29,16],L[xx,xx,xx,16]

		mono4	esi+edx*2+ 8*2,edi+edx*4+ 8*4,none,xmm4,xmm2
		compose2type1	xmm4,xmm2,xmm0	; get L[27,26,25,28],L[31,30,29,28]
		movaps	[ebx+ecx*4+28*4],xmm0

		mono4	esi+edx*2+ 4*2,edi+edx*4+ 4*4,none,xmm0,xmm2
		compose2type1	xmm0,xmm2,xmm4	; get L[23,22,21,24],L[27,26,25,24]
		movaps	[ebx+ecx*4+24*4],xmm4

		mono4	esi+edx*2+ 0*2,edi+edx*4+ 0*4,none,xmm4,xmm2
		compose2type1	xmm4,xmm2,xmm0	; get L[19,18,17,20],L[23,22,21,20]
		movaps	[ebx+ecx*4+20*4],xmm0

		movss	xmm4,xmm6				; get L[19,18,17,16]
		movaps	[ebx+ecx*4+16*4],xmm4

		mono4	esi+edx*2+16*2,edi+edx*4+16*4,none,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+12*4],xmm2

		mono4	esi+edx*2+20*2,edi+edx*4+20*4,none,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+8*4],xmm2

		; Input sample 1112 is where saved_frame ends: from there on the
		; source is frame_buf (ebp = &frame_buf[-1112]).  The switch point
		; falls 24 samples into the group that starts at edx = 480+576+32.
		cmp		edx,480+576+32
		cmovae	esi,ebp

		mono4	esi+edx*2+24*2,edi+edx*4+24*4,none,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+4*4],xmm2

		mono4	esi+edx*2+28*2,edi+edx*4+28*4,none,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+0*4],xmm2

		add		edx,32
		sub		ecx,32
		jnz		near .part2.lp0

;;
.part3:
		mov		ecx,480
		mov		ebp,saved_frame
		shl		edx,1				; sample index -> byte offset (2 bytes per mono sample)
		sub		ebp,edx				; = &saved_frame[-480-96] or &saved_frame[-480-672]
		sub		ebx,480*4			; = &flt_buf[1152-96-480] or &flt_buf[1152-672-480]
		shr		edx,1				; back to a sample index
		jmp		short .part3.f0
; granule 2
; frame_buf[ 519:  40] flt_buf[  32: 511] int_buf[1151: 672] saved_frame[ 479:   0] 480 samples

; granule 1
; saved_frame[1055: 576] flt_buf[ 608:1087] int_buf[ 575:  96] saved_frame[ 479:   0] 480 samples

; 3rd part:
;       input:    (mode_gr == 1)           (mode_gr == 2)
;           ecx = 480                      480
;           edx = 480+96                   480+672
;           ebx = &flt_buf[1152-96-480]    &flt_buf[1152-672-480]
;           edi = &int_buf[-480]           &int_buf[-480]
;           esi = &saved_frame[0]          &frame_buf[-1112]
;           ebp = &saved_frame[-480-96]    &saved_frame[-480-672]
;       output:
;           ecx = 0                        0
;           edx = 480+96+480               480+672+480
;
		align	16
.part3.lp0:
.part3.f0:

		mono4	esi+edx*2+12*2,edi+edx*4+12*4,ebp+edx*2+12*2,xmm0,xmm2
		compose2type1	xmm0,xmm2,xmm6	; get L[31,30,29,16],L[xx,xx,xx,16]

		mono4	esi+edx*2+ 8*2,edi+edx*4+ 8*4,ebp+edx*2+ 8*2,xmm4,xmm2
		compose2type1	xmm4,xmm2,xmm0	; get L[27,26,25,28],L[31,30,29,28]
		movaps	[ebx+ecx*4+28*4],xmm0

		mono4	esi+edx*2+ 4*2,edi+edx*4+ 4*4,ebp+edx*2+ 4*2,xmm0,xmm2
		compose2type1	xmm0,xmm2,xmm4	; get L[23,22,21,24],L[27,26,25,24]
		movaps	[ebx+ecx*4+24*4],xmm4

		mono4	esi+edx*2+ 0*2,edi+edx*4+ 0*4,ebp+edx*2+ 0*2,xmm4,xmm2
		compose2type1	xmm4,xmm2,xmm0	; get L[19,18,17,20],L[23,22,21,20]
		movaps	[ebx+ecx*4+20*4],xmm0

		movss	xmm4,xmm6				; get L[19,18,17,16]
		movaps	[ebx+ecx*4+16*4],xmm4

		mono4	esi+edx*2+16*2,edi+edx*4+16*4,ebp+edx*2+16*2,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+12*4],xmm2

		mono4	esi+edx*2+20*2,edi+edx*4+20*4,ebp+edx*2+20*2,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+8*4],xmm2

		mono4	esi+edx*2+24*2,edi+edx*4+24*4,ebp+edx*2+24*2,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+4*4],xmm2

		mono4	esi+edx*2+28*2,edi+edx*4+28*4,ebp+edx*2+28*2,xmm0,xmm2
		compose2type2	xmm0,xmm2
		movaps	[ebx+ecx*4+0*4],xmm2

		add		edx,32
		sub		ecx,32
		jnz		near .part3.lp0

;;
.part4:
		mov		eax,[esp+_P+16]		; = mode_gr
		cmp		al,2
		je		.part4.gr2
.part4.gr1:
; granule 1
; saved_frame[1111:1056] int_buf[ 631: 576] saved_frame[ 535: 480] 56 samples

; 4th part:
;       input:    (mode_gr == 1)
;           ecx = 56/4 (iterations, 4 samples each)
;           edx = 480+96+480
;           edi = &int_buf[-480]
;           esi = &saved_frame[0]
;           ebp = &saved_frame[-480-96]
;       output:
;           ecx = 0
;           edx = 480+96+480+56

		mov		ecx,56/4
		jmp		short .part4.gr1.f0

		align	16
.part4.gr1.lp0:
.part4.gr1.f0:
		mono4	esi+edx*2,edi+edx*4,ebp+edx*2	; saved_frame, int_buf, saved_frame
		add		edx,4
		dec		ecx					; dec/jnz rather than `loop`, same as .part4.lp0
		jnz		.part4.gr1.lp0
;;
		mov		esi,[esp+_P+8]		; frame_buf
		sub		esi,1112*2
		mov		ecx,576/4
		jmp		short .part4.f0

.part4.gr2:
		mov		ecx,632/4
		jmp		short .part4.f0

; granule 1
;   frame_buf[ 575:   0] int_buf[1207: 632] saved_frame[1111: 536] 576 samples
; granule 2
;   frame_buf[1151: 520] int_buf[1783:1152] saved_frame[1111: 480] 576+56 samples

; 4th part:
;       input:    (mode_gr == 1)           (mode_gr == 2)
;           ecx = 576/4                    632/4 ((576+56)/4)   iterations, 4 samples each
;           edx = 480+96+480+56            480+672+480
;           edi = &int_buf[-480]           &int_buf[-480]
;           esi = &frame_buf[-1112]        &frame_buf[-1112]
;           ebp = &saved_frame[-480-96]    &saved_frame[-480-672]
;       output:
;           ecx = 0                        0
;           edx = 480+96+480+56+576        480+672+480+632

		align	16
.part4.lp0:
.part4.f0:
		mono4	esi+edx*2,edi+edx*4,ebp+edx*2
		add		edx,4
%if 1
		dec		ecx
		jnz		.part4.lp0
%else
		loop	.part4.lp0
%endif
;;
		pop		ebp
		pop		ebx
		pop		edi
		pop		esi
		emms						; leave MMX state before returning to the caller
		ret
%undef	saved_frame

;
;		2000/03/27	
;
;void sbd_shiftin_SSE(int (*mfbuf)[1152+576+EXTRADELAY], int stereo, int mode_gr);
		align	16
;-----------------------------------------------------------------------
; void sbd_shiftin_SSE(int (*mfbuf)[1152+576+EXTRADELAY], int stereo, int mode_gr);
; ABI:    32-bit cdecl, arguments on the stack (read via [esp+_P+n]).
; For channel 0, and for channel 1 as well when the low byte of `stereo` is 2:
;   1. copies (512-32) floats inside sbd_xxx[ch], from index
;      1152+32-18*32*mode_gr up to index 1152+32;
;   2. converts 18*32*mode_gr ints read sequentially from mfbuf[ch] to float
;      and stores them in groups of 32, the first group at
;      sbd_xxx[ch][1152 .. 1152+31] and each following group 32 entries lower,
;      using the in-group permutation listed at .lp2.
; Registers: eax = mfbuf[ch], edx = 18*32*mode_gr*F_SIZE (bytes),
;            edi = walks sbd_xxx[ch], esi = source pointer, ecx = loop counter.
; Clobbers eax, ecx, edx, xmm0-xmm7, flags; esi and edi are saved/restored.
; All sbd_xxx accesses use movaps, so sbd_xxx must be 16-byte aligned.
; cvtpi2ps is an MMX-class conversion; Intel documents it as switching the
; x87 unit into MMX state, so emms is executed before returning.
;-----------------------------------------------------------------------
sbd_shiftin_SSE:
		push	esi
		push	edi
%assign _P 2*4
		mov		edx,[esp+_P+12]		; = mode_gr
		imul	edx,18*32*F_SIZE	; = bytes of new samples per channel
		mov		eax,[esp+_P+4]		; = mfbuf
		mov		edi,sbd_xxx+1152*4	; = &sbd_xxx[0][1152]

.lp0:
; memcpy(&sbd_xxx[ch][1152+32], &sbd_xxx[ch][1152+32-18*32*mode_gr], (512-32)*sizeof(float));
		mov		esi,edi
		sub		esi,edx				; = &sbd_xxx[ch][1152-18*32*mode_gr]
		mov		ecx,(512-32)
		jmp		short .f1

		align	16
.f1:
.lp1:
		; copies entries ecx .. ecx+31 (relative to esi / edi), highest group first
		movaps	xmm0,[esi + ecx*4 +  0*F_SIZE]
		movaps	[edi + ecx*4 +  0*F_SIZE],xmm0
		movaps	xmm7,[esi + ecx*4 +  4*F_SIZE]
		movaps	[edi + ecx*4 +  4*F_SIZE],xmm7
		movaps	xmm6,[esi + ecx*4 +  8*F_SIZE]
		movaps	[edi + ecx*4 +  8*F_SIZE],xmm6
		movaps	xmm5,[esi + ecx*4 + 12*F_SIZE]
		movaps	[edi + ecx*4 + 12*F_SIZE],xmm5
		movaps	xmm4,[esi + ecx*4 + 16*F_SIZE]
		movaps	[edi + ecx*4 + 16*F_SIZE],xmm4
		movaps	xmm3,[esi + ecx*4 + 20*F_SIZE]
		movaps	[edi + ecx*4 + 20*F_SIZE],xmm3
		movaps	xmm2,[esi + ecx*4 + 24*F_SIZE]
		movaps	[edi + ecx*4 + 24*F_SIZE],xmm2
		movaps	xmm1,[esi + ecx*4 + 28*F_SIZE]
		movaps	[edi + ecx*4 + 28*F_SIZE],xmm1
		sub		ecx,32
		jnz		.lp1

		mov		esi,eax				; = mfbuf[ch]
		mov		ecx,edx				; byte offset of the first (highest) output group
		sub		edi,edx				; = &sbd_xxx[ch][1152-18*32*mode_gr]
		jmp		short .f2

		align	16
.f2:
.lp2:
;	    sbd_xxx[31:28] = c[17:20] = win_buf[14:11];
;	    sbd_xxx[27:24] = c[21:24] = win_buf[10: 7];
;	    sbd_xxx[23:20] = c[25:28] = win_buf[ 6: 3];
;	    sbd_xxx[19:16] = {c[29:31],c[16]} = {win_buf[ 2: 0],win_buf[   15]};
;	    sbd_xxx[12:15] = c[12:15] = win_buf[19:16];
;	    sbd_xxx[ 8:11] = c[ 8:11] = win_buf[23:20];
;	    sbd_xxx[ 4: 7] = c[ 4: 7] = win_buf[27:24];
;	    sbd_xxx[ 0: 3] = c[ 0: 3] = win_buf[31:28];

		cvtpi2ps	xmm0,[esi +  1*4]		; lanes 1,0 = in[2], in[1]
		cvtsi2ss	xmm1,[esi +  0*4]		; lane 0    = in[0]
		shufps	xmm1,xmm0,01000000B			; -> [in2, in1, in0, in0]
		cvtsi2ss	xmm1,[esi + 15*4]		; -> [in2, in1, in0, in15]
		movaps	[edi+ecx+16*F_SIZE],xmm1

		cvtpi2ps	xmm2,[esi +  5*4]
		cvtpi2ps	xmm3,[esi +  3*4]
		movlhps	xmm3,xmm2					; -> [in6, in5, in4, in3]
		movaps	[edi+ecx+20*F_SIZE],xmm3

		cvtpi2ps	xmm4,[esi +  9*4]
		cvtpi2ps	xmm5,[esi +  7*4]
		movlhps	xmm5,xmm4					; -> [in10, in9, in8, in7]
		movaps	[edi+ecx+24*F_SIZE],xmm5

		cvtpi2ps	xmm6,[esi + 13*4]
		cvtpi2ps	xmm7,[esi + 11*4]
		movlhps	xmm7,xmm6					; -> [in14, in13, in12, in11]
		movaps	[edi+ecx+28*F_SIZE],xmm7

		cvtpi2ps	xmm0,[esi + 16*4]
		cvtpi2ps	xmm1,[esi + 18*4]
		shufps	xmm1,xmm0,0x11				; -> [in16, in17, in18, in19]
		movaps	[edi+ecx+12*F_SIZE],xmm1

		cvtpi2ps	xmm2,[esi + 20*4]
		cvtpi2ps	xmm3,[esi + 22*4]
		shufps	xmm3,xmm2,0x11				; -> [in20, in21, in22, in23]
		movaps	[edi+ecx+ 8*F_SIZE],xmm3

		cvtpi2ps	xmm4,[esi + 24*4]
		cvtpi2ps	xmm5,[esi + 26*4]
		shufps	xmm5,xmm4,0x11				; -> [in24, in25, in26, in27]
		movaps	[edi+ecx+ 4*F_SIZE],xmm5

		cvtpi2ps	xmm6,[esi + 28*4]
		cvtpi2ps	xmm7,[esi + 30*4]
		shufps	xmm7,xmm6,0x11				; -> [in28, in29, in30, in31]
		movaps	[edi+ecx+ 0*F_SIZE],xmm7

		add		esi,32*4
		sub		ecx,32*4
		jnz		near .lp2

		cmp		byte [esp+_P+8],2	; stereo == 2 ?
		jne		.exit

		add		edi,edx				; back to &sbd_xxx[ch][1152]
		add		eax,(1152+576+EXTRADELAY)*F_SIZE	; next mfbuf row
		add		edi,(1152+HAN_SIZE)*F_SIZE			; next sbd_xxx row
		cmp		edi,sbd_xxx+(1152+HAN_SIZE)*F_SIZE*2
		jb		near .lp0			; still inside the two rows -> do channel 1

.exit:
		pop	edi
		pop	esi
		emms						; leave MMX state (entered by cvtpi2ps) before returning
		ret

;
;		2000/03/27	sԂ1...
;
;void sbd_shiftin_SSE_MULTI(int (*mfbuf)[1152+576+EXTRADELAY], int stereo, int mode_gr);
;-----------------------------------------------------------------------
; void sbd_shiftin_SSE_MULTI(int (*mfbuf)[1152+576+EXTRADELAY], int stereo, int mode_gr);
; ABI:    32-bit cdecl, arguments on the stack (read via [esp+_P+n]).
; Same data movement as sbd_shiftin_SSE, but with prefetchnta hints on the
; sources and non-temporal movntps stores to sbd_xxx.
; For channel 0, and for channel 1 as well when the low byte of `stereo` is 2:
;   1. copies (512-32) floats inside sbd_xxx[ch], from index
;      1152+32-18*32*mode_gr up to index 1152+32;
;   2. converts 18*32*mode_gr ints read sequentially from mfbuf[ch] to float
;      and stores them in groups of 32, the first group at
;      sbd_xxx[ch][1152 .. 1152+31] and each following group 32 entries lower,
;      using the in-group permutation listed at .lp2.
; Registers: eax = mfbuf[ch], edx = 18*32*mode_gr*F_SIZE (bytes),
;            edi = walks sbd_xxx[ch], esi = source pointer, ecx = loop counter.
; Clobbers eax, ecx, edx, xmm0-xmm7, flags; esi and edi are saved/restored.
; sbd_xxx must be 16-byte aligned (movaps / movntps).
; cvtpi2ps is an MMX-class conversion; Intel documents it as switching the
; x87 unit into MMX state, so emms is executed before returning.
; NOTE(review): movntps stores are weakly ordered.  If another thread reads
; sbd_xxx after this call, confirm that the hand-off includes a fence.
;-----------------------------------------------------------------------
		align	16
sbd_shiftin_SSE_MULTI:
		push	esi
		push	edi
%assign _P 2*4
		mov		edx,[esp+_P+12]		; = mode_gr
		imul	edx,18*32*F_SIZE	; = bytes of new samples per channel
		mov		eax,[esp+_P+4]		; = mfbuf
		mov		edi,sbd_xxx+1152*4	; = &sbd_xxx[0][1152]

.lp0:
; memcpy(&sbd_xxx[ch][1152+32], &sbd_xxx[ch][1152+32-18*32*mode_gr], (512-32)*sizeof(float));
		mov		esi,edi
		sub		esi,edx				; = &sbd_xxx[ch][1152-18*32*mode_gr]
		mov		ecx,(512-32)
		jmp		short .f1

		align	16
.f1:
.lp1:
		; copies entries ecx .. ecx+31 (relative to esi / edi), highest group
		; first, while prefetching the group that is copied next
		prefetchnta	[esi + ecx*4 + 24*F_SIZE]
		movaps	xmm0,[esi + ecx*4 +  0*F_SIZE]
		movntps	[edi + ecx*4 +  0*F_SIZE],xmm0
		movaps	xmm7,[esi + ecx*4 +  4*F_SIZE]
		movntps	[edi + ecx*4 +  4*F_SIZE],xmm7
		prefetchnta	[esi + ecx*4 - 32*F_SIZE]
		movaps	xmm6,[esi + ecx*4 +  8*F_SIZE]
		movntps	[edi + ecx*4 +  8*F_SIZE],xmm6
		movaps	xmm5,[esi + ecx*4 + 12*F_SIZE]
		movntps	[edi + ecx*4 + 12*F_SIZE],xmm5
		prefetchnta	[esi + ecx*4 - 24*F_SIZE]
		movaps	xmm4,[esi + ecx*4 + 16*F_SIZE]
		movntps	[edi + ecx*4 + 16*F_SIZE],xmm4
		movaps	xmm3,[esi + ecx*4 + 20*F_SIZE]
		movntps	[edi + ecx*4 + 20*F_SIZE],xmm3
		prefetchnta	[esi + ecx*4 - 16*F_SIZE]
		movaps	xmm2,[esi + ecx*4 + 24*F_SIZE]
		movntps	[edi + ecx*4 + 24*F_SIZE],xmm2
		movaps	xmm1,[esi + ecx*4 + 28*F_SIZE]
		movntps	[edi + ecx*4 + 28*F_SIZE],xmm1
		sub		ecx,32
		jnz		.lp1

		mov		esi,eax				; = mfbuf[ch]
		mov		ecx,edx				; byte offset of the first (highest) output group
		sub		edi,edx				; = &sbd_xxx[ch][1152-18*32*mode_gr]
		jmp		short .f2

		align	16
.f2:
.lp2:
;	    sbd_xxx[31:28] = c[17:20] = win_buf[14:11];
;	    sbd_xxx[27:24] = c[21:24] = win_buf[10: 7];
;	    sbd_xxx[23:20] = c[25:28] = win_buf[ 6: 3];
;	    sbd_xxx[19:16] = {c[29:31],c[16]} = {win_buf[ 2: 0],win_buf[   15]};
;	    sbd_xxx[12:15] = c[12:15] = win_buf[19:16];
;	    sbd_xxx[ 8:11] = c[ 8:11] = win_buf[23:20];
;	    sbd_xxx[ 4: 7] = c[ 4: 7] = win_buf[27:24];
;	    sbd_xxx[ 0: 3] = c[ 0: 3] = win_buf[31:28];

		prefetchnta	[esi +  0*4]

		cvtpi2ps	xmm0,[esi + 28*4]
		cvtpi2ps	xmm1,[esi + 30*4]
		shufps	xmm1,xmm0,0x11				; -> [in28, in29, in30, in31]

		cvtpi2ps	xmm2,[esi + 24*4]
		cvtpi2ps	xmm3,[esi + 26*4]
		shufps	xmm3,xmm2,0x11				; -> [in24, in25, in26, in27]

		prefetchnta	[esi +  8*4]

		cvtpi2ps	xmm4,[esi + 20*4]
		cvtpi2ps	xmm5,[esi + 22*4]
		shufps	xmm5,xmm4,0x11				; -> [in20, in21, in22, in23]

		cvtpi2ps	xmm6,[esi + 16*4]
		cvtpi2ps	xmm7,[esi + 18*4]
		shufps	xmm7,xmm6,0x11				; -> [in16, in17, in18, in19]
		movntps	[edi+ecx+ 0*F_SIZE],xmm1
		movntps	[edi+ecx+ 4*F_SIZE],xmm3
		movntps	[edi+ecx+ 8*F_SIZE],xmm5
		movntps	[edi+ecx+12*F_SIZE],xmm7

		prefetchnta	[esi + 48*4]

		cvtsi2ss	xmm1,[esi +  0*4]		; lane 0    = in[0]
		cvtpi2ps	xmm0,[esi +  1*4]		; lanes 1,0 = in[2], in[1]
		shufps	xmm1,xmm0,01000000B			; -> [in2, in1, in0, in0]

		cvtpi2ps	xmm3,[esi +  3*4]
		cvtpi2ps	xmm2,[esi +  5*4]
		movlhps	xmm3,xmm2					; -> [in6, in5, in4, in3]

		prefetchnta	[esi + 56*4]

		cvtpi2ps	xmm5,[esi +  7*4]
		cvtsi2ss	xmm1,[esi + 15*4]		; -> [in2, in1, in0, in15]
		cvtpi2ps	xmm4,[esi +  9*4]
		movlhps	xmm5,xmm4					; -> [in10, in9, in8, in7]

		cvtpi2ps	xmm7,[esi + 11*4]
		cvtpi2ps	xmm6,[esi + 13*4]
		movlhps	xmm7,xmm6					; -> [in14, in13, in12, in11]
		movntps	[edi+ecx+16*F_SIZE],xmm1
		movntps	[edi+ecx+20*F_SIZE],xmm3
		movntps	[edi+ecx+24*F_SIZE],xmm5
		movntps	[edi+ecx+28*F_SIZE],xmm7

		add		esi,32*4
		sub		ecx,32*4
		jnz		near .lp2

		cmp		byte [esp+_P+8],2	; stereo == 2 ?
		jne		.exit

		add		edi,edx				; back to &sbd_xxx[ch][1152]
		add		eax,(1152+576+EXTRADELAY)*F_SIZE	; next mfbuf row
		add		edi,(1152+HAN_SIZE)*F_SIZE			; next sbd_xxx row
		cmp		edi,sbd_xxx+(1152+HAN_SIZE)*F_SIZE*2
		jb		near .lp0			; still inside the two rows -> do channel 1

.exit:
		pop	edi
		pop	esi
		emms						; leave MMX state (entered by cvtpi2ps) before returning
		ret

;;

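;	Change history (English reconstructed from mis-encoded Japanese notes;
;	entries marked "(?)" could not be fully recovered)
;	1999/07/19	SSE version of window_filter_subbandKNI()
;	1999/07/21	117k[clk]@PIII
;	1999/07/22 	loop unrolling, 102k[clk]@PIII
;	1999/07/23	rearranged the coefficient matrix for 4-way parallel processing, 90k-85k[clk]
;	1999/09/01	changed to take mode_gr, to support encoding of LSF (22.05kHz) streams
;	1999/09/03	improved iDCT routine, 68k[clk]@PIII
;	1999/09/04	changed register allocation, 62k[clk]@PIII
;	1999/09/29	reduced load/store count; timing unchanged
;	1999/11/14	eliminated movups. 59k[clk]@PIII
;   2000/03/07	eliminated an arithmetic step (operation name lost) (?). 55.4k[clk]@Coppermine(?) -> 53.8k[clk]@Coppermine(?)
;   2000/03/15	(;_;) bug fix.
;   2000/03/16	halved the number of enwindow loads ->45.3k[clk]@Coppermine(?)
;   2000/03/17	further optimization ->44.9k[clk]@Coppermine(?)
;   2000/03/21	SCALER is pre-multiplied into enwindow ->44.6k[clk]@Coppermine(?)
;   2000/03/24	split out sbd_shiftin 41k[clk]@Coppermine(?)
;   2000/03/29	note concerning eax (text lost to mis-encoding) (?)
;	2000/04/17	work partitioning for SMP optimization (?) by K.SAKAI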
; void window_filter_subband_SSE(float *win_buf, float *s, int mode_gr)
;
; Windowing + 32-point transform of the subband filter, using SSE.
;
; Arguments (32-bit, all passed on the stack):
;   win_buf : float buffer; processing starts at c = &win_buf[1152] and c
;             moves DOWN by 32 floats per pass.  Each pass reads c[0..511].
;   s       : output; 32 floats are written per pass, s advances by 32.
;   mode_gr : 1 or 2; the outer loop runs 18*mode_gr passes.
;
; Registers:
;   edi = c, ebp = s, esi = remaining passes,
;   ecx = inner loop index/counter, edx = idct_coefficient row pointer.
;   esi, edi, ebp are saved and restored; ecx, edx, xmm0-xmm7 and the flags
;   are clobbered; eax and ebx are not touched.
;
; Alignment: every vector access is movaps (or a mulps memory operand), so c
;   (hence win_buf), enwindow and idct_coefficient must be 16-byte aligned.
;
; Stack: a 32-float scratch array yyy[] is carved out below the caller's esp
;   on a 32-byte boundary; the original esp is kept at [yyy+32*4].
		align	16
window_filter_subband_SSE:
		push	esi
		push	edi
		push	ebp
%assign _P 3*4
		mov		edi,[esp+_P+4]		; = win_buf
		mov		ebp,[esp+_P+8]		; = s
		mov		esi,[esp+_P+12]		; = mode_gr(=1 or 2)
		add		edi,1152*F_SIZE		; = c = &win_buf[1152]
		add		esi,esi
		lea		esi,[esi+esi*8]		; = j = 18*mode_gr

; allocate yyy[32]
; (32*4 bytes for yyy plus 4 bytes for the saved esp; after the AND the slot
;  at [esp+32*4] still lies below the original esp)
		mov		edx,esp
		sub		esp,32*4+4
		and		esp,~31				; align to 32 byte boundary
		mov		[esp+32*4],edx		; save the original ESP
		jmp		short .lp0
%define	yyy esp

		align	16
.lp0:
; ---- one pass: elements i = 0..3 (lane 0 gets special treatment) ----
; Accumulators: xmm4 = a, xmm5 = b, xmm7 = d, xmm6 = c-term.
; In xmm6's products lane 0 of the enwindow[8][..] vector is replaced by
; enwindow[4][..][0], so lane 0 of xmm6 accumulates the y[0] sum.
;
;   a  =c[64*0 + i   ]*enwindow[i/4  ][0][3:1];
;   y[16]  = c[64*0]*enwindow[0][0][0]
;
;   b  =c[64*7 + 64-i]*enwindow[i/4  ][0][3:1]; // reversed
;
;   d  =c[64*0 + i+32]*enwindow[i/4+8][0][3:1];
;   y[16] += c[64*0+32]*enwindow[8][0][0]
;
;   c -=c[64*7 + 32-i]*enwindow[i/4+8][0][3:1]; // reversed
;   y[ 0]  = c[64*7+16]*enwindow[4][7][0];

		movaps	xmm0,[enwindow           + 0*4*4]
		movaps	xmm3,[enwindow + 8*8*4*4 + 0*4*4]
		movaps	xmm4,[edi + (64*0 +  0)*4]
		movaps	xmm5,[edi + (64*7 + 48)*4]
		mulps	xmm4,xmm0
		mulps	xmm5,xmm0
		movss	xmm0,[enwindow + 4*8*4*4 + 7*4*4] ; = enwindow[4][7][0]
		movaps	xmm7,[edi + (64*0 + 32)*4]
		movaps	xmm6,[edi + (64*7 + 16)*4]	; lowest scalar is c[64*7+16]
		mulps	xmm7,xmm3
		movss	xmm3,xmm0	; replace lowest coefficient with enwindow[4][7][0]
		mulps	xmm6,xmm3

; The next block is the hand-expanded equivalent of window_filter_type1 7,0
; (same instruction sequence as the macro defined below).
		movaps	xmm2,[enwindow + 8*8*4*4 + 7*4*4]
		movaps	xmm3,[edi + (64*7 + 32)*4]
		mulps	xmm3,xmm2
		movss	xmm0,[enwindow + 4*8*4*4 + 0*4*4] ; = enwindow[4][0][0]
		movss	xmm2,xmm0	; replace lowest coefficient with enwindow[4][0][0]
		mulps	xmm2,[edi + (64*0 + 16)*4]	; lowest scalar is c[64*0+16]
		addps	xmm7,xmm3
		addps	xmm6,xmm2
		movaps	xmm1,[enwindow           + 7*4*4]
		movaps	xmm0,[edi + (64*7 +  0)*4]
		mulps	xmm0,xmm1
		mulps	xmm1,[edi + (64*0 + 48)*4]
		addps	xmm4,xmm0
		addps	xmm5,xmm1

; window_filter_type1 p,q : add one more tap pair to the four accumulators.
;   a +=c[64*%1 + i   ]*enwindow[0  ][%1][3:1];
;   y[16] += c[64*%1]*enwindow[0][%1][0]
;
;   b +=c[64*%2 + 64-i]*enwindow[0  ][%1][3:1]; // reversed
;
;   d +=c[64*%1 + i+32]*enwindow[8][%1][3:1];
;   y[16] += c[64*%1+32]*enwindow[8][%1][0]
;
;   c +=c[64*%2 + 32-i]*enwindow[8][%1][3:1]; // reversed
;   y[ 0] += c[64*%2+16]*enwindow[4][%2][0]
; Uses xmm0-xmm3 as temporaries; accumulates into xmm4, xmm5, xmm6, xmm7.

%macro	window_filter_type1	2
		movaps	xmm1,[enwindow           + %1*4*4]
		movaps	xmm0,[edi + (64*%1 +  0)*4]
		mulps	xmm0,xmm1
		mulps	xmm1,[edi + (64*%2 + 48)*4]
		addps	xmm4,xmm0
		addps	xmm5,xmm1
		movaps	xmm2,[enwindow + 8*8*4*4 + %1*4*4]
		movaps	xmm3,[edi + (64*%1 + 32)*4]
		mulps	xmm3,xmm2
		movss	xmm0,[enwindow + 4*8*4*4 + %2*4*4] ; = enwindow[4][%2][0]
		movss	xmm2,xmm0	; replace lowest coeficient with enwindow[4][%2][0]
		mulps	xmm2,[edi + (64*%2 + 16)*4]	; lowest scaler is c[64*%2+16]
		addps	xmm7,xmm3
		addps	xmm6,xmm2
%endmacro

; 7,0 is not invoked here: it was expanded by hand above.
;		window_filter_type1	7,0
		window_filter_type1	1,6
		window_filter_type1	6,1
		window_filter_type1	2,5
		window_filter_type1	5,2
		window_filter_type1	3,4
		window_filter_type1	4,3

; Combine the four accumulators.  Lane 0 of xmm6 (the y[0] sum) is parked in
; yyy[12] and merged into its final vector after the .lp1 loop.
		movss	[yyy+12*4],xmm6
		addps	xmm5,xmm7	; b = xmm5 = {yyy[19],yyy[18],yyy[17],---}
		addss	xmm7,xmm4	; = yyy[16]
		subps	xmm4,xmm6	; a = xmm4 = {yyy[13],yyy[14],yyy[15],---}
		movss	xmm5,xmm7
		movaps	[yyy+16*4],xmm5
		movaps	xmm1,xmm4	; it is stored in later

; ecx is a byte offset: 4*4, 8*4, 12*4 -> element groups i = 4..7, 8..11,
; 12..15.  ecx*8 steps enwindow by one [i/4] slice (8*4*4 bytes) per group.
		mov		ecx,4*4
		jmp		short .f1
;				for(i = 4; i < 16; i++){
;                        register float  a, b;
		align	16
.lp1:
.f1:
;      { 4,  5,  6,  7}, { 8,  9, 10, 11}, {12, 13, 14, 15}
;      {28, 27, 26, 25}, {24, 23, 22, 21}, {20, 19, 18, 17}

;   a  =c[64*0 + i   ]*enwindow[i/4  ][0][i%4];
;   b  =c[64*7 + 64-i]*enwindow[i/4  ][0][i%4]; // reversed
;   a -=c[64*7 + 32-i]*enwindow[i/4+8][0][i%4]; // reversed
;   b +=c[64*0 + i+32]*enwindow[i/4+8][0][i%4];

		movaps	xmm6,[edi + ecx + (64*0 +  0)*4]	; = a
		movaps	xmm7,[edi + ecx + (64*7 + 48)*4]	; = b
		movaps	xmm0,[enwindow + ecx*8           + 0*4*4]
		mulps	xmm6,xmm0
		mulps	xmm7,xmm0
		movaps	xmm4,[edi + ecx + (64*7 + 16)*4]
		movaps	xmm5,[edi + ecx + (64*0 + 32)*4]
		movaps	xmm0,[enwindow + ecx*8 + 8*8*4*4 + 0*4*4]
		mulps	xmm4,xmm0
		mulps	xmm5,xmm0
		subps	xmm6,xmm4
		addps	xmm7,xmm5

; window_filter_type2 p,q : add one more tap pair to a (xmm6) and b (xmm7).
;   a +=c[64*%1 + i   ]*enwindow[i/4  ][%1][3:0];
;   b +=c[64*%2 + 64-i]*enwindow[i/4  ][%1][3:0]; // reversed
;   a -=c[64*%2 + 32-i]*enwindow[i/4+8][%1][3:0]; // reversed
;   b +=c[64*%1 + i+32]*enwindow[i/4+8][%1][3:0];
; Uses xmm2-xmm5 as temporaries; xmm1 (previous group's a) is left intact.
%macro	window_filter_type2	2
		movaps	xmm2,[edi + ecx + (64*%1 +  0)*4]
		movaps	xmm3,[enwindow + ecx*8           + %1*4*4]
		mulps	xmm2,xmm3
		mulps	xmm3,[edi + ecx + (64*%2 + 48)*4]
		addps	xmm6,xmm2
		addps	xmm7,xmm3
		movaps	xmm4,[edi + ecx + (64*%2 + 16)*4]
		movaps	xmm5,[enwindow + ecx*8 + 8*8*4*4 + %1*4*4]
		mulps	xmm4,xmm5
		mulps	xmm5,[edi + ecx + (64*%1 + 32)*4]
		subps	xmm6,xmm4
		addps	xmm7,xmm5
%endmacro

		window_filter_type2	7,0
		window_filter_type2	1,6
		window_filter_type2	6,1
		window_filter_type2	2,5
		window_filter_type2	5,2
		window_filter_type2	3,4
		window_filter_type2	4,3

;    	y[16-i] = a;	// stored reversed within each group of 4 elements
;       y[i+16] = b;
; xmm1 holds the previous group's a.  Its lane 0 is overwritten with lane 0
; of the current a, then shufps 01101100B reorders lanes to {0,3,2,1} before
; the vector is stored one group below the current ecx offset.
		movaps	[yyy + ecx + 16*4],xmm7
		movss	xmm1,xmm6
		shufps	xmm1,xmm1,01101100B
		movaps	[yyy + ecx - 4*4],xmm1
		movaps	xmm1,xmm6	; carry the current a into the next pass

		add		ecx,4*4
		cmp		ecx,16*4
		jb		near .lp1

; Final group: merge the parked y[0] sum (yyy[12]) into lane 0 of the last a
; vector, apply the same lane reorder, and store it as yyy[12..15].
		movss	xmm0,[yyy+12*4]
		movss	xmm1,xmm0
		shufps	xmm1,xmm1,01101100B
		movaps	[yyy+12*4],xmm1

;;
; 32-point transform: each of the 16 passes below forms one 32-term dot
; product of an idct_coefficient row with yyy[], split into two partial sums
; s0 and s1, and emits s0+s1 and s0-s1.
;       for( i=0; i<16; i++ ){
;               s0 = s1 = 0.0;
;               for( j=0; j<32; j+=4 ){
;                       s0 += (*m)[i+0][j  ]*yprime[j+0];
;                       s1 += (*m)[i+0][j+1]*yprime[j+1];
;                       s0 += (*m)[i+0][j+2]*yprime[j+2];
;                       s1 += (*m)[i+0][j+3]*yprime[j+3];
;               }
;               xout[i+ 0] = s0+s1;
;               xout[31-i] = s0-s1;
;       }
; yyy[0..3] and yyy[16..31] are kept in xmm3..xmm7 for the whole loop; the
; other three vectors are used as memory operands.
		movaps	xmm3,[yyy+ 0*4]		; = yprime[15:12]
		movaps	xmm4,[yyy+16*4]
		movaps	xmm5,[yyy+20*4]
		movaps	xmm6,[yyy+24*4]
		movaps	xmm7,[yyy+28*4]

		sub		edi,32*4	; c -= 32

; edx points 16 floats into the current 32-float coefficient row, so the row
; is addressed with offsets -64..+48 bytes.
		mov		ecx,16
		mov		edx,idct_coefficient+16*F_SIZE
		jmp		short .lp5

		align	16
.lp5:
		movaps	xmm0,[edx-64]
		mulps	xmm0,[yyy+12*4]	; = yprime[ 3: 0]

		movaps	xmm1,[edx-48]
		mulps	xmm1,[yyy+ 8*4]	; = yprime[ 7: 4]
		addps	xmm1,xmm0

		movaps	xmm2,[edx-32]
		mulps	xmm2,[yyy+ 4*4]	; = yprime[11: 8]
		addps	xmm2,xmm1

		movaps	xmm0,[edx-16]
		mulps	xmm0,xmm3
		addps	xmm0,xmm2

		movaps	xmm1,[edx+ 0]
		mulps	xmm1,xmm4
		addps	xmm0,xmm1

		movaps	xmm2,[edx+16]
		mulps	xmm2,xmm5
		addps	xmm0,xmm2

		movaps	xmm1,[edx+32]
		mulps	xmm1,xmm6
		addps	xmm0,xmm1

		movaps	xmm2,[edx+48]
		mulps	xmm2,xmm7
		addps	xmm0,xmm2

; Horizontal reduction: lane0 <- lane0+lane2 (= s0), lane1 <- lane1+lane3
; (= s1); then s1 is moved to lane 0 of xmm1 to form s0+s1 and s0-s1.
		add		edx,32*F_SIZE	; next coefficient row
		movhlps	xmm1,xmm0
		addps	xmm0,xmm1
		movaps	xmm1,xmm0
		shufps	xmm1,xmm0,0x01
		movaps	xmm2,xmm0
		addss	xmm0,xmm1
		subss	xmm2,xmm1

; s[i] = s0+s1.  After ebp += 4 and ecx-- the address ebp+ecx*8 is &s[31-i].
; movss does not modify flags, so jnz still tests the result of dec ecx.
		movss	[ebp],xmm0
		add		ebp,4
		dec		ecx
		movss	[ebp+ecx*8],xmm2
		jnz		near .lp5

; ebp advanced by 16 floats inside .lp5; skip the upper 16 that were written
; through the mirrored store, so that s += 32 in total per pass.
		add		ebp,16*4
		dec		esi
		jnz		near .lp0

; free area for yyy[32]
		mov		esp,[esp+32*4]
%undef yyy

		pop	ebp
		pop	edi
		pop	esi
		ret

		end
