/*******************************************************
                        PFTOOLS
 *******************************************************
  Sep 30, 2011 xalip.c
 *******************************************************
 (C) 2011 Swiss Institute of Bioinformatics
     Thierry Schuepbach (thierry.schuepbach@isb-sib.ch)
 *******************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <stdbool.h>

#include "profile.h"

// Two-argument maximum / minimum.
// Every argument and the whole expansion are parenthesised so the macros can
// be embedded in larger expressions (e.g. `2 * MIN(x, y)`) without operator
// precedence changing the result.
// NOTE: each argument may still be evaluated twice; do not pass expressions
// with side effects.
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#define MIN(a,b) (((a) < (b)) ? (a) : (b))

// Per-row minima of the alignment start positions, recomputed by every call
// to nextR / nextR_last and consumed by the search control in xalip_sse41.
struct Minima {
   // Starts at iseq, then lowered to the smallest Element.One seen in
   // ioi/iom/Kiod over all profile positions >= 1.
   int _a;
   // Same as _a, but only profile positions iprf > N1 contribute.
   int _b;
   // Smallest Element.B seen in ioi/iom/Kiod over all profile positions
   // (position 0 included).
   int _c;
};

/*
 * InitR - initialise the score row (iop) and the two position rows (iom, ioi)
 *         before the sequence is scanned starting after position iseq.
 *
 *   iseq  sequence index the scan restarts from
 *   N1,N2 bounds of the protected profile region; N1 must be > 0
 *   bseq  first sequence position; while iseq < bseq the per-position
 *         FirstSequenceProtein scores are used, otherwise the From[EXTRA]
 *         tuple of each profile position's transitions
 *   lseq  sequence length
 *   iop   score row, prf->Length + 1 entries, written
 *   iom   match position row, prf->Length + 1 entries, written
 *   ioi   insertion position row, prf->Length + 1 entries, written
 *   prf   profile providing the scores
 *
 * iom and ioi are written with _mm_store_si128 and therefore have to be
 * 16-byte aligned.  The function terminates the program when N1 is 0.
 */
static void InitR(const int iseq, const size_t N1, const size_t N2, const size_t bseq, const size_t lseq,
                  union lScores * const restrict iop, union Positions * const restrict iom,
                  union Positions * const restrict ioi,  const struct Profile * const restrict prf) 
{
#ifdef XALIP_DEBUG
  fprintf(stdout,"InitR called with index %i bseq %zu\n",iseq, bseq);
#endif
  // N1-1 is used as an index and as a loop bound below; with N1 == 0 it would
  // wrap around, so reject that case before anything is written.
  if (N1 == 0) { fputs("BUG HERE N1 is NULL\n", stderr); exit(1);}

  // Are we treating sequence index below given start?
  const _Bool BelowStart = ( iseq < bseq );
  const ScoreTuple * restrict FirstSequenceProtein = BelowStart
     ? &(prf->Scores.Insertion.FirstSequenceProtein[0])
     : &(prf->Scores.Insertion.Transitions->From[EXTRA]);
  // Distance, in ScoreTuple units, between two consecutive profile positions
  const size_t FirstSequenceProteinAlignStep = BelowStart ? 1 : 4;

  int (* restrict pIOP)[4] = &(iop[0].Element);

  // Profile position 0 takes the entry scores unchanged
  for (int i=0; i<4; ++i) {
    pIOP[0][i] = (int) FirstSequenceProtein->To[i];
  }
  FirstSequenceProtein += FirstSequenceProteinAlignStep;

  // Running deletion score of the previous profile position
  int KOPD = pIOP[0][DELETION];

  const short int * restrict pTransitions = &(prf->Scores.Insertion.Transitions[1].Element[_DM]);
  const short int * restrict pMatch       = &prf->Scores.Match.Alphabet[_D];
  const size_t AlignStep = prf->Scores.Match.AlignStep;

  for (unsigned int iprf=1; iprf<=(unsigned int) prf->Length; ++iprf) {
    const int KD = KOPD + (int) *pMatch;
    pMatch += AlignStep;

    // Keep, per state, the better of "continue from the previous deletion"
    // and "enter the profile here".  The entry tuple is read BEFORE its
    // pointer is advanced so that position iprf uses entry iprf (the pointer
    // was already moved past entry 0 above).
    for (int i=0; i<4; ++i) {
      const int Continued = KD + (int) pTransitions[i];
      const int Entered   = (int) FirstSequenceProtein->To[i];
      pIOP[iprf][i] = (Continued > Entered) ? Continued : Entered;
    }

    // Move to next profile position
    pTransitions         += INSERTION_TRANSITIONS_SIZE;
    FirstSequenceProtein += FirstSequenceProteinAlignStep;

    KOPD = pIOP[iprf][DELETION];
  }

  // Positions ahead of the protected region: { lseq+1, 0, iseq+1, 0 }
  union Positions TPOS __attribute__((aligned(16)));
  TPOS.Element.One   = lseq + 1;
  TPOS.Element.Two   = 0;
  TPOS.Element.B     = iseq + 1;
  TPOS.Element.dummy = 0;

  for (unsigned int iprf=0; iprf<(unsigned int) (N1-1); ++iprf) {
    _mm_store_si128(&(iom[iprf].xmm), TPOS.xmm);
    _mm_store_si128(&(ioi[iprf].xmm), TPOS.xmm);
  }

  // At N1-1 only the insertion row keeps the "ahead" value; the match row
  // already belongs to the protected region.
  ioi[N1-1].xmm = TPOS.xmm;

  // Protected region: { iseq+1, iseq+1, iseq+1, 0 }
  TPOS.Element.One = TPOS.Element.B;
  TPOS.Element.Two = TPOS.Element.B;
  iom[N1-1].xmm = TPOS.xmm;

  for (unsigned int iprf=N1; iprf<(unsigned int)N2; ++iprf) {
    _mm_store_si128(&(iom[iprf].xmm), TPOS.xmm);
    _mm_store_si128(&(ioi[iprf].xmm), TPOS.xmm);
  }

  // Positions behind the protected region: { lseq+1, 0, iseq+1, 0 }
  TPOS.Element.One = lseq + 1;
  TPOS.Element.Two = 0;

  for (unsigned int iprf=(unsigned int) N2; iprf<=(unsigned int)prf->Length; ++iprf) {
    _mm_store_si128(&(iom[iprf].xmm), TPOS.xmm);
    _mm_store_si128(&(ioi[iprf].xmm), TPOS.xmm);
  }
}

static void nextR(const struct Profile * const restrict prf, const unsigned char * const restrict Sequence,
                  const int iseq, union lScores * const restrict iop, union Positions * const restrict iom,
                  union Positions * const restrict ioi,const int lseq, struct Alignment * const restrict alignment,
                  struct Minima * const restrict ifer, const _Bool Lock, const size_t N1, const size_t N2)
{
#ifdef XALIP_DEBUG
   fprintf(stdout,"NextR called with iseq %i\n",iseq);
#endif
   // Initialization
   const unsigned int In = iseq + 1;
   // WARNING: Fortran uses a 1 based index for sequence
   const unsigned int SequenceIndex = (unsigned int) Sequence[iseq-1];
   
   if ( iseq >= lseq) {
      fputs("nextR_last should have been called\n", stderr);
      exit(1);   
   } 
   
   // Disable match and insert vertices of protected region
   if (Lock) {
      iop[N1-1].Element[MATCH] = NLOW;
      for (size_t iprf=N1; iprf<N2; ++iprf) {
         iop[iprf].Element[MATCH]     = NLOW;
         iop[iprf].Element[INSERTION] = NLOW;
      } 
   }
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Profile position 0
   ////////////////////////////////////////////////////////////////////////////////////////////
   
   // Save previous match position
   int Kopm = iop[0].Element[MATCH];
   union Positions Kpos __attribute__((aligned(16)));
   Kpos.xmm = iom[0].xmm;
   
   const union Positions TEMPpos __attribute__((aligned(16))) = { lseq+1, 0, In, 0 };
   
   const union Positions * restrict PTRpos[4];
   union Positions Kiod;
   PTRpos[0] = &TEMPpos;
   PTRpos[1] = &Kpos;
   PTRpos[3] = &Kiod;
     
   // Get pointers to score data
   const TransitionScores * const restrict Transitions = prf->Scores.Insertion.Transitions;
   const short int * restrict Insertion = prf->Scores.Insertion.Alphabet;
   const size_t AlignStep = prf->Scores.Insertion.AlignStep;
   
   int Ki = iop[0].Element[INSERTION] + (int) Insertion[SequenceIndex];
   
   // Match position
   int Ji   = Ki + (int) Transitions[0].From[INSERTION].To[MATCH];
   int itmp = (int) Transitions[0].From[EXTRA].To[MATCH];
   if ( Ji > itmp) {
      iop[0].Element[MATCH] = Ji;
      iom[0].xmm = ioi[0].xmm;
   } else {
      iop[0].Element[MATCH] = itmp;
      iom[0].xmm = TEMPpos.xmm;
   }
   
   // Deletion position
   int Kopd;
   Ji   = Ki + (int) Transitions[0].From[INSERTION].To[DELETION];
   itmp = (int) Transitions[0].From[EXTRA].To[DELETION];
   if ( Ji > itmp ) {
      Kopd     = Ji;
      Kiod.xmm = ioi[0].xmm;
   } else {
      Kopd     = itmp;
      Kiod.xmm = TEMPpos.xmm; 
   } 

   // Insertion position
   Ji   = Ki + (int) Transitions[0].From[INSERTION].To[INSERTION];
   itmp = (int) Transitions[0].From[EXTRA].To[INSERTION];
   if ( Ji > itmp ) {
      iop[0].Element[INSERTION] = Ji;
   } else {
      iop[0].Element[INSERTION] = itmp;
      ioi[0].xmm = TEMPpos.xmm;
   }
   
   // Initialize minima
   ifer->_a = iseq;
   ifer->_b = iseq;
   itmp     = MIN(ioi[0].Element.B, iom[0].Element.B);
   ifer->_c = MIN(itmp, Kiod.Element.B); 
   
   // Initialize alignment
   union Positions Fpos __attribute__((aligned(16)));;
   Fpos.Element.One   = alignment->JAL1;
   Fpos.Element.Two   = alignment->JAL2;
   Fpos.Element.B     = alignment->JALB;
   Fpos.Element.dummy = 0;
   
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Loop through rest of profile
   ////////////////////////////////////////////////////////////////////////////////////////////
   const short int * restrict Match = prf->Scores.Match.Alphabet;
   Insertion += AlignStep;
   
   for (int iprf=1; iprf<=prf->Length; ++iprf) {
      /////////////////////////////////////////////////////////////////////////////////////////
      // Match
      const register int KM = Kopm + (int) Match[SequenceIndex];
      Kopm = iop[iprf].Element[MATCH];
      
      __m128i __KM = _mm_set1_epi32(KM);
      // Load Transitions
      __m128i __TransitionsM = _mm_loadl_epi64((__m128i*) &(Transitions[iprf].From[MATCH]));
      // Convert signed WORD into signed DWORD
      __TransitionsM = _mm_cvtepi16_epi32(__TransitionsM);
      // Add KM to Transitions
      __TransitionsM = _mm_add_epi32(__TransitionsM, __KM);
      

      /////////////////////////////////////////////////////////////////////////////////////////
      // Insertion
      const register int KI = iop[iprf].Element[INSERTION] + (int) Insertion[SequenceIndex]; 
      // one could move on the seq index instead
      
      __m128i __KI = _mm_set1_epi32(KI);
      // Load Transitions
      __m128i __TransitionsI = _mm_loadl_epi64((__m128i*) &(Transitions[iprf].From[INSERTION]));
      // Convert signed WORD into signed DWORD
      __TransitionsI = _mm_cvtepi16_epi32(__TransitionsI);
      // Add KM to Transitions
      __TransitionsI = _mm_add_epi32(__TransitionsI, __KI);
      
      /////////////////////////////////////////////////////////////////////////////////////////
      // Deletion
      const register int KD = Kopd + (int) Match[_D];   
      
      __m128i __KD = _mm_set1_epi32(KD);
      // Load Transitions
      __m128i __TransitionsD = _mm_loadl_epi64((__m128i*) &(Transitions[iprf].From[DELETION]));
      // Convert signed WORD into signed DWORD
      __TransitionsD = _mm_cvtepi16_epi32(__TransitionsD);
      // Add KM to Transitions
      __TransitionsD = _mm_add_epi32(__TransitionsD, __KD);
      
      /////////////////////////////////////////////////////////////////////////////////////////
      // Extensions
      // Load Transitions
      __m128i __TransitionsX = _mm_loadl_epi64((__m128i*) &(Transitions[iprf].From[EXTRA]));
      // Convert signed WORD into signed DWORD
      __TransitionsX = _mm_cvtepi16_epi32(__TransitionsX);
      
      // Insert NLOW into Extension vector -> done in io.c
      //__TransitionsX = _mm_insert_epi32(__TransitionsX, NLOW, DUMMY);
      
      // Move to next profile index
      Match     += AlignStep;
      Insertion += AlignStep;

      /////////////////////////////////////////////////////////////////////////////////////////
      // Compute Maxima (Fortran Nstep function)
      __m128i __Index = (__m128i) _mm_setzero_si128();
      __m128i __three = _mm_set_epi32(3,3,3,3);
      
      __m128i  __Mask = _mm_cmpgt_epi32(__TransitionsD, __TransitionsX);
      __TransitionsX  = _mm_blendv_epi8(__TransitionsX, __TransitionsD, __Mask);
      __Index         = _mm_blendv_epi8(__Index, __three, __Mask);
      
      __m128i __One  = _mm_set_epi32(1,1,1,1);
      __Mask         = _mm_cmpgt_epi32(__TransitionsM, __TransitionsX);
      __TransitionsX = _mm_blendv_epi8(__TransitionsX, __TransitionsM, __Mask);
      __Index        = _mm_blendv_epi8(__Index, __One, __Mask);
      
      __m128i __Two  = _mm_add_epi32(__One, __One);
      __Mask         = _mm_cmpgt_epi32(__TransitionsI, __TransitionsX);
      __TransitionsX = _mm_blendv_epi8(__TransitionsX, __TransitionsI, __Mask);
      __Index        = _mm_blendv_epi8(__Index, __Two, __Mask);
      
      // Set new data
      iop[iprf].xmm = __TransitionsX;
      //StoreMatchInsertion((__m64*) &iop[iprf],(__m128) __TransitionsX);
      Kopd = _mm_extract_epi32(__TransitionsX, DELETION);

      /////////////////////////////////////////////////////////////////////////////////////////
      // Check for new maxima
      const int KE = _mm_extract_epi32(__TransitionsX, DUMMY);
      if (KE > alignment->JALS) {
         alignment->JALS = KE;
         alignment->JALE = iseq;
         Fpos.xmm = ioi[iprf].xmm;
         const unsigned int Id = (unsigned int) _mm_extract_epi32(__Index, DUMMY);
         if (Id == 1)  { // KM is max
               Fpos.xmm = Kpos.xmm;
         } else if (Id == 3) { // KD is max
               Fpos.xmm = Kiod.xmm;
         }
      }
#ifdef XALIP_DEBUG
      printf("FPOS %8i %8i %8i\n", Fpos.Element.One, Fpos.Element.Two, Fpos.Element.B);
#endif
      /////////////////////////////////////////////////////////////////////////////////////////
      // Update alignment positions
      union Positions Jpos __attribute__((aligned(16)));
      Jpos.xmm  = iom[iprf].xmm;
      PTRpos[2] = &ioi[iprf];
      
      const int NewM = _mm_extract_epi32(__Index, MATCH);
      iom[iprf].xmm  = PTRpos[NewM]->xmm;
      
      const int NewD = _mm_extract_epi32(__Index, DELETION);
      Kiod.xmm       = PTRpos[NewD]->xmm;
      
      const int NewI = _mm_extract_epi32(__Index, INSERTION);
      ioi[iprf].xmm  = PTRpos[NewI]->xmm;
      
      Kpos.xmm = Jpos.xmm;
      
      /////////////////////////////////////////////////////////////////////////////////////////
      // Update minima
      
      const int t1 = MIN(ioi[iprf].Element.One, iom[iprf].Element.One);
      const int t2 = MIN(t1, Kiod.Element.One);
      ifer->_a     = MIN(ifer->_a, t2);
      
      if (iprf > N1) {
         ifer->_b = MIN(ifer->_b, t2);   
      }
      const int t3 = MIN(ioi[iprf].Element.B, iom[iprf].Element.B);
      const int t4 = MIN(t3, Kiod.Element.B);
      ifer->_c     = MIN(ifer->_c, t4);  
   }
   
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Epilogue
   ////////////////////////////////////////////////////////////////////////////////////////////
   
   // Finish updating alignment positions
   alignment->JAL1 = Fpos.Element.One;
   alignment->JAL2 = Fpos.Element.Two;
   alignment->JALB = Fpos.Element.B;
   
   // Entry and exit from protected regions
   iom[N1-1].Element.One = MIN(iom[N1-1].Element.One, In);
   iom[N1-1].Element.Two = In;
   
   for (int iprf=N1; iprf<N2; ++iprf) {
      iom[iprf].Element.One = MIN(iom[iprf].Element.One, In);
      iom[iprf].Element.Two = In;
   
      ioi[iprf].Element.One = MIN(ioi[iprf].Element.One, In);
      ioi[iprf].Element.Two = In;
   }  
   
#ifdef XALIP_DEBUG
   for (int iprf=0; iprf<=prf->Length; ++iprf) {
      printf("NEXTR IOP %4.4i %15i %15i %15i\n", iprf, iop[iprf].Element[MATCH], iop[iprf].Element[INSERTION],
             iop[iprf].Element[DELETION]);
      printf("NEXTR IOM %4.4i %15i %15i %15i\n", iprf, iom[iprf].Element.One, iom[iprf].Element.Two,
             iom[iprf].Element.B);
      printf("NEXTR IOI %4.4i %15i %15i %15i\n", iprf, ioi[iprf].Element.One, ioi[iprf].Element.Two,
             ioi[iprf].Element.B);
      
   }
   
   printf("NEXTR ALIGN %4i %4i %4i %4i %4i\n",
                   alignment->JAL1, alignment->JAL2, alignment->JALS, alignment->JALB, alignment->JALE); 
#endif
}

/*
 * nextR_last - advance the work rows by the final sequence position
 *              (iseq >= lseq, otherwise the program is terminated).
 *
 * Identical to nextR except that, for the match, insertion and deletion
 * candidates, the DUMMY lane of the transition vector is replaced by the
 * corresponding prf->Scores.Insertion.LastSequenceProtein[iprf] score before
 * the state score is added.
 *
 *   prf        profile providing match, insertion and transition scores
 *   Sequence   encoded sequence; position iseq is read at Sequence[iseq-1]
 *   iseq       1-based sequence position being processed
 *   iop        score row, updated in place
 *   iom, ioi   match / insertion position rows, updated in place
 *   lseq       sequence length
 *   alignment  best alignment so far; JALS/JALE/JAL1/JAL2/JALB are updated
 *              when a score above alignment->JALS is found
 *   ifer       receives the row minima (see struct Minima)
 *   Lock       when set, match/insert states of the protected region
 *              [N1-1, N2) are disabled (set to NLOW) before the update
 *   N1, N2     bounds of the protected profile region
 */
static void nextR_last(const struct Profile * const restrict prf, const unsigned char * const restrict Sequence,
                       const int iseq, union lScores * const restrict iop, union Positions * const restrict iom,
                       union Positions * const restrict ioi,const int lseq, struct Alignment * const restrict alignment,
                       struct Minima * const restrict ifer, const _Bool Lock, const size_t N1, const size_t N2)
{
#ifdef XALIP_DEBUG
   fprintf(stdout,"NextR_last called with iseq %i\n",iseq);
#endif
   // Only the last sequence position is handled here; check before reading data
   if ( iseq < lseq) {
      fputs("nextR should have been called\n", stderr);
      exit(1);   
   } 

   // Initialization
   const unsigned int In = iseq + 1;
   // WARNING: Fortran uses a 1 based index for sequence
   const unsigned int SequenceIndex = (unsigned int) Sequence[iseq-1];
   
   // Disable match and insert vertices of protected region
   if (Lock) {
      iop[N1-1].Element[MATCH] = NLOW;
      for (size_t iprf=N1; iprf<N2; ++iprf) {
         iop[iprf].Element[MATCH]     = NLOW;
         iop[iprf].Element[INSERTION] = NLOW;
      } 
   }
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Profile position 0
   ////////////////////////////////////////////////////////////////////////////////////////////
   
   // Save previous match position
   int Kopm = iop[0].Element[MATCH];
   union Positions Kpos __attribute__((aligned(16)));
   Kpos.xmm = iom[0].xmm;
   
   const union Positions TEMPpos __attribute__((aligned(16))) = { lseq+1, 0, In, 0 };
   
   // Lookup table indexed by the winning source of a state:
   //   0 -> fresh start (TEMPpos), 1 -> match (Kpos),
   //   2 -> insertion (ioi[iprf], set inside the loop), 3 -> deletion (Kiod).
   // The pointers are deliberately NOT restrict-qualified: Kpos, Kiod and
   // ioi[iprf] are also modified directly while being read through this table.
   const union Positions * PTRpos[4];
   union Positions Kiod;
   PTRpos[0] = &TEMPpos;
   PTRpos[1] = &Kpos;
   PTRpos[3] = &Kiod;
     
   // Get pointers to score data
   const TransitionScores * const restrict Transitions = prf->Scores.Insertion.Transitions;
   const short int * restrict Insertion = prf->Scores.Insertion.Alphabet;
   const size_t AlignStep = prf->Scores.Insertion.AlignStep;
   
   int Ki = iop[0].Element[INSERTION] + (int) Insertion[SequenceIndex];
   
   // Match position
   int Ji   = Ki + (int) Transitions[0].From[INSERTION].To[MATCH];
   int itmp = (int) Transitions[0].From[EXTRA].To[MATCH];
   if ( Ji > itmp) {
      iop[0].Element[MATCH] = Ji;
      iom[0].xmm = ioi[0].xmm;
   } else {
      iop[0].Element[MATCH] = itmp;
      iom[0].xmm = TEMPpos.xmm;
   }
   
   // Deletion position
   int Kopd;
   Ji   = Ki + (int) Transitions[0].From[INSERTION].To[DELETION];
   itmp = (int) Transitions[0].From[EXTRA].To[DELETION];
   if ( Ji > itmp ) {
      Kopd     = Ji;
      Kiod.xmm = ioi[0].xmm;
   } else {
      Kopd     = itmp;
      Kiod.xmm = TEMPpos.xmm; 
   } 

   // Insertion position
   Ji   = Ki + (int) Transitions[0].From[INSERTION].To[INSERTION];
   itmp = (int) Transitions[0].From[EXTRA].To[INSERTION];
   if ( Ji > itmp ) {
      iop[0].Element[INSERTION] = Ji;
   } else {
      iop[0].Element[INSERTION] = itmp;
      ioi[0].xmm = TEMPpos.xmm;
   }
   
   // Initialize minima
   ifer->_a = iseq;
   ifer->_b = iseq;
   itmp     = MIN(ioi[0].Element.B, iom[0].Element.B);
   ifer->_c = MIN(itmp, Kiod.Element.B); 
   
   // Initialize alignment
   union Positions Fpos __attribute__((aligned(16)));
   Fpos.Element.One   = alignment->JAL1;
   Fpos.Element.Two   = alignment->JAL2;
   Fpos.Element.B     = alignment->JALB;
   Fpos.Element.dummy = 0;
   
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Loop through rest of profile
   ////////////////////////////////////////////////////////////////////////////////////////////
   const short int * restrict Match = prf->Scores.Match.Alphabet;
   const ScoreTuple * const restrict LastProteinSequence = prf->Scores.Insertion.LastSequenceProtein;
   Insertion += AlignStep;
   
   for (int iprf=1; iprf<=prf->Length; ++iprf) {
      /////////////////////////////////////////////////////////////////////////////////////////
      // Match: previous row's match score of the previous profile position
      const int KM = Kopm + (int) Match[SequenceIndex];
      Kopm = iop[iprf].Element[MATCH];
      
      // Load 4 signed WORD transitions, widen to signed DWORD, overwrite the
      // DUMMY lane with the last-sequence score, then add KM
      __m128i vFromM = _mm_loadl_epi64((const __m128i*) &(Transitions[iprf].From[MATCH]));
      vFromM = _mm_cvtepi16_epi32(vFromM);
      vFromM = _mm_insert_epi32(vFromM, (int) LastProteinSequence[iprf].From[MATCH], DUMMY);
      vFromM = _mm_add_epi32(vFromM, _mm_set1_epi32(KM));

      /////////////////////////////////////////////////////////////////////////////////////////
      // Insertion: previous row's insertion score of this profile position
      const int KI = iop[iprf].Element[INSERTION] + (int) Insertion[SequenceIndex]; 
      // one could move on the seq index instead
      
      // Same treatment as above, adding KI
      __m128i vFromI = _mm_loadl_epi64((const __m128i*) &(Transitions[iprf].From[INSERTION]));
      vFromI = _mm_cvtepi16_epi32(vFromI);
      vFromI = _mm_insert_epi32(vFromI, (int) LastProteinSequence[iprf].From[INSERTION], DUMMY);
      vFromI = _mm_add_epi32(vFromI, _mm_set1_epi32(KI));
      
      /////////////////////////////////////////////////////////////////////////////////////////
      // Deletion: this row's deletion score of the previous profile position
      const int KD = Kopd + (int) Match[_D];   
      
      // Same treatment as above, adding KD
      __m128i vFromD = _mm_loadl_epi64((const __m128i*) &(Transitions[iprf].From[DELETION]));
      vFromD = _mm_cvtepi16_epi32(vFromD);
      vFromD = _mm_insert_epi32(vFromD, (int) LastProteinSequence[iprf].From[DELETION], DUMMY);
      vFromD = _mm_add_epi32(vFromD, _mm_set1_epi32(KD));
      
      /////////////////////////////////////////////////////////////////////////////////////////
      // Extensions: starting value of the running maximum
      __m128i vBest = _mm_loadl_epi64((const __m128i*) &(Transitions[iprf].From[EXTRA]));
      vBest = _mm_cvtepi16_epi32(vBest);
      
      // Insert NLOW into Extension vector -> done in io.c
      
      // Move to next profile index
      Match     += AlignStep;
      Insertion += AlignStep;

      /////////////////////////////////////////////////////////////////////////////////////////
      // Compute Maxima (Fortran Nstep function).
      // vIndex records, per lane, which source won: 0 extension, 3 deletion,
      // 1 match, 2 insertion.  Strict '>' comparisons are applied in the order
      // deletion, match, insertion, so ties keep the earlier candidate.
      // The integer zero intrinsic is used (no float-to-integer vector cast).
      __m128i vIndex = _mm_setzero_si128();
      
      __m128i vMask = _mm_cmpgt_epi32(vFromD, vBest);
      vBest  = _mm_blendv_epi8(vBest, vFromD, vMask);
      vIndex = _mm_blendv_epi8(vIndex, _mm_set1_epi32(3), vMask);
      
      vMask  = _mm_cmpgt_epi32(vFromM, vBest);
      vBest  = _mm_blendv_epi8(vBest, vFromM, vMask);
      vIndex = _mm_blendv_epi8(vIndex, _mm_set1_epi32(1), vMask);
      
      vMask  = _mm_cmpgt_epi32(vFromI, vBest);
      vBest  = _mm_blendv_epi8(vBest, vFromI, vMask);
      vIndex = _mm_blendv_epi8(vIndex, _mm_set1_epi32(2), vMask);
      
      // Set new data
      iop[iprf].xmm = vBest;
      Kopd = _mm_extract_epi32(vBest, DELETION);
      
      /////////////////////////////////////////////////////////////////////////////////////////
      // Check for new maxima: the DUMMY lane is the candidate score compared
      // against the best score so far
      const int KE = _mm_extract_epi32(vBest, DUMMY);
      if (KE > alignment->JALS) {
         alignment->JALS = KE;
         alignment->JALE = iseq;
         const unsigned int Id = (unsigned int) _mm_extract_epi32(vIndex, DUMMY);
         if (Id == 1)  { // KM is max
            Fpos.xmm = Kpos.xmm;
         } else if (Id == 3) { // KD is max
            Fpos.xmm = Kiod.xmm;
         } else {
            Fpos.xmm = ioi[iprf].xmm;
         }
      }
#ifdef XALIP_DEBUG
      printf("FPOS %8i %8i %8i\n", Fpos.Element.One, Fpos.Element.Two, Fpos.Element.B);
#endif
      /////////////////////////////////////////////////////////////////////////////////////////
      // Update alignment positions.  The order of the three assignments is
      // significant: each one may read a value written by the previous one.
      union Positions Jpos __attribute__((aligned(16))); 
      Jpos.xmm  = iom[iprf].xmm;
      PTRpos[2] = &ioi[iprf];
      
      const int NewM = _mm_extract_epi32(vIndex, MATCH);
      iom[iprf].xmm  = PTRpos[NewM]->xmm;
      
      const int NewD = _mm_extract_epi32(vIndex, DELETION);
      Kiod.xmm       = PTRpos[NewD]->xmm;
      
      const int NewI = _mm_extract_epi32(vIndex, INSERTION);
      ioi[iprf].xmm  = PTRpos[NewI]->xmm;
      
      // The match position saved before the update feeds the next profile position
      Kpos.xmm = Jpos.xmm;
      
      /////////////////////////////////////////////////////////////////////////////////////////
      // Update minima
      
      const int t1 = MIN(ioi[iprf].Element.One, iom[iprf].Element.One);
      const int t2 = MIN(t1, Kiod.Element.One);
      ifer->_a     = MIN(ifer->_a, t2);
      
      if (iprf > N1) {
         ifer->_b = MIN(ifer->_b, t2);   
      }
      const int t3 = MIN(ioi[iprf].Element.B, iom[iprf].Element.B);
      const int t4 = MIN(t3, Kiod.Element.B);
      ifer->_c     = MIN(ifer->_c, t4);  
   }
   
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Epilogue
   ////////////////////////////////////////////////////////////////////////////////////////////
   
   // Finish updating alignment positions
   alignment->JAL1 = Fpos.Element.One;
   alignment->JAL2 = Fpos.Element.Two;
   alignment->JALB = Fpos.Element.B;
   
   // Entry and exit from protected regions
   iom[N1-1].Element.One = MIN(iom[N1-1].Element.One, In);
   iom[N1-1].Element.Two = In;
   
   for (size_t iprf=N1; iprf<N2; ++iprf) {
      iom[iprf].Element.One = MIN(iom[iprf].Element.One, In);
      iom[iprf].Element.Two = In;
   
      ioi[iprf].Element.One = MIN(ioi[iprf].Element.One, In);
      ioi[iprf].Element.Two = In;
   }  
   
#ifdef XALIP_DEBUG
   for (int iprf=0; iprf<=prf->Length; ++iprf) {
      printf("NEXTR IOP %4.4i %15i %15i %15i\n", iprf, iop[iprf].Element[MATCH], iop[iprf].Element[INSERTION],
             iop[iprf].Element[DELETION]);
      printf("NEXTR IOM %4.4i %15i %15i %15i\n", iprf, iom[iprf].Element.One, iom[iprf].Element.Two,
             iom[iprf].Element.B);
      printf("NEXTR IOI %4.4i %15i %15i %15i\n", iprf, ioi[iprf].Element.One, ioi[iprf].Element.Two,
             ioi[iprf].Element.B);
      
   }
   
   printf("NEXTR ALIGN %4i %4i %4i %4i %4i\n",
                   alignment->JAL1, alignment->JAL2, alignment->JALS, alignment->JALB, alignment->JALE);
#endif
}

/*
 * xalip_sse41 - collect the alignments of a sequence against a profile whose
 *               score reaches kcut.
 *
 *   prf        profile
 *   Sequence   encoded sequence
 *   iop        score work row (prf->Length + 1 entries)
 *   iom, ioi   position work rows (prf->Length + 1 entries each)
 *   bseq       first sequence position to scan
 *   lseq       last sequence position to scan
 *   alignment  output list; accepted alignments are stored at indices
 *              1..return value (index 0 is not written)
 *   Lock       per sequence position flags; positions JAL1..JAL2 of every
 *              accepted alignment are set to true
 *   N1, N2     bounds of the protected profile region (N1 must be > 0)
 *   Lopt       when set, return right after the first accepted alignment
 *   kcut       minimum score for an alignment to be accepted
 *   NALI       maximum number of alignments that may be stored
 *
 * Returns the number of stored alignments, -1 when an alignment with
 * JAL2 < JAL1 is produced, or -2 when more than NALI alignments are found.
 */
int xalip_sse41( const struct Profile * const restrict prf, const unsigned char * const restrict Sequence,
           union lScores * const restrict iop, union Positions * const restrict iom,
           union Positions * const restrict ioi, const size_t bseq, const size_t lseq,
           struct Alignment * const restrict alignment,
           _Bool * const restrict Lock, const size_t N1, const size_t N2, const _Bool Lopt,
           const int kcut, const size_t NALI)
{
   int iseq;
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Prologue
   ////////////////////////////////////////////////////////////////////////////////////////////
   
   // Alignment list
   int iopt = NLOW;
   size_t nali = 0;
   
   // Search control fields
   int ibeg    = bseq-1;             // sequence index each pass restarts from
   size_t jlcp = prf->Length / 2;    // distance between two check points
   int nsca    = 0;                  // number of sequence positions processed
   
   // Stack Memory
   struct Minima ifer __attribute__((aligned(16)));
   
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Two step forward one step backward loop
   ////////////////////////////////////////////////////////////////////////////////////////////
   MajorLoop:
   
   iseq = ibeg;
   struct Alignment lAlignment;
   lAlignment.JALS = NLOW;
   lAlignment.JAL1 = 0;
   lAlignment.JAL2 = 0;
   lAlignment.JALB = 0;
   lAlignment.JALE = 0;
   
   // Initiate work array
   InitR(iseq, N1, N2, bseq, lseq, iop, iom, ioi, prf);
#ifdef XALIP_DEBUG
   for (size_t i=0; i<=prf->Length; ++i) {
      // iop[].Element is an int[4] indexed by state, unlike the Positions struct
      printf("IOP %8i %8i %8i %8i\n",
             iop[i].Element[MATCH], iop[i].Element[INSERTION], iop[i].Element[DELETION],
             iop[i].Element[DUMMY]);
      printf("IOM %8i %8i %8i %8i\n",
             iom[i].Element.One, iom[i].Element.Two, iom[i].Element.B,
             iom[i].Element.dummy);
      printf("IOI %8i %8i %8i %8i\n",
             ioi[i].Element.One, ioi[i].Element.Two, ioi[i].Element.B,
             ioi[i].Element.dummy);          
   }
#endif
   
   // Initiate search control values
   int ilcp = iseq;           // sequence position of the last check point
   int ifcp = iseq+1;         // restart candidate recorded at the last check point
   int nlcp = ilcp + jlcp;    // sequence position of the next check point
   
   // Move one sequence position forward
   ++iseq;
   
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Loop over sequence positions
   ////////////////////////////////////////////////////////////////////////////////////////////
   SeqPosLoop:
   {
      ++nsca;
      /////////////////////////////////////////////////////////////////////////////////////////
      // Update work array
      if (iseq < lseq ) {
      #pragma noinline
         nextR(prf, Sequence, iseq, iop, iom, ioi, lseq, &lAlignment,&ifer, Lock[iseq], N1, N2);
      } else {
      #pragma noinline
         nextR_last(prf, Sequence, iseq, iop, iom, ioi, lseq, &lAlignment,&ifer, Lock[iseq], N1, N2);      
      }
      /////////////////////////////////////////////////////////////////////////////////////////
      // If Match found
      if (lAlignment.JALS >= kcut) {
         // Determine first entry of current row: the alignment is complete once
         // every path still alive starts after its end, or the sequence is exhausted
         if ( (ifer._a > lAlignment.JAL2) || (iseq == lseq) ) {
            // Fill in missing alignment data
            lAlignment.JAL1 = lAlignment.JAL1 == 0 ? lAlignment.JALB : lAlignment.JAL1;
            lAlignment.JAL2 = lAlignment.JAL2 == 0 ? iseq            : lAlignment.JAL2;
            
            // Check for errors
            if (lAlignment.JAL2 < lAlignment.JAL1) {
               fputs("Error: Illegal alignment found - no list produced.\n", stderr);
               return -1;
            }
            
            if (++nali > NALI) {
               fputs("Warning: Too many alignments found - list may be incomplete.\n", stderr);
               return -2;
            } 
            
            // Accept alignment (the list is filled starting at index 1)
            struct Alignment * pAlignment = &alignment[nali];
            pAlignment->JALS = lAlignment.JALS;
            pAlignment->JALB = lAlignment.JALB;
            pAlignment->JAL1 = lAlignment.JAL1;
            pAlignment->JAL2 = lAlignment.JAL2;
            pAlignment->JALE = lAlignment.JALE;
#ifdef XALIP_DEBUG
            printf("XALIP ALIGN %zu %4.4i %4.4i %4.4i %4.4i %4.4i\n",nali,
                   lAlignment.JAL1, lAlignment.JAL2, lAlignment.JALS, lAlignment.JALB, lAlignment.JALE);
#endif   
            // Protect sequence region
            for (int jseq=lAlignment.JAL1; jseq<=lAlignment.JAL2; ++jseq) {
               Lock[jseq] = true;
            } 
            
            // Exit if only searching for optimal alignment
            if (nali>0 && Lopt) 
               return nali;
            else 
               goto MajorLoop;
         } else {
            if ( ++iseq <= lseq ) goto SeqPosLoop;
         }
      } else {
         // Have we reached next check point ?
         if (iseq >= nlcp) {
            // Determine first entry of current row
            if (ifer._b >= ilcp) { 
               ibeg = ifcp - 1;
               ifcp = ifer._c;
               ilcp = iseq;
            }
            
            // Calculate next check point
            nlcp += jlcp;
         }
         
         // Move one sequence position forward
         if ( ++iseq <= lseq ) goto SeqPosLoop;
      }
   }
   return nali;
}

