// // $Id: sphinxsearch.cpp 4664 2014-04-18 18:08:22Z tomat $ // // // Copyright (c) 2001-2014, Andrew Aksyonoff // Copyright (c) 2008-2014, Sphinx Technologies Inc // All rights reserved // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License. You should have // received a copy of the GPL license along with this program; if you // did not, you can find it at http://www.gnu.org/ // #include "sphinx.h" #include "sphinxsearch.h" #include "sphinxquery.h" #include "sphinxint.h" #include ////////////////////////////////////////////////////////////////////////// /// costs for max_predicted_time estimations, in nanoseconds /// YMMV, defaults were estimated in a very specific environment, and then rounded off int g_iPredictorCostDoc = 64; int g_iPredictorCostHit = 48; int g_iPredictorCostSkip = 2048; int g_iPredictorCostMatch = 64; ////////////////////////////////////////////////////////////////////////// // EXTENDED MATCHING V2 ////////////////////////////////////////////////////////////////////////// #define SPH_TREE_DUMP 0 #define SPH_BM25_K1 1.2f #define SPH_BM25_SCALE 1000 struct QwordsHash_fn { static inline int Hash ( const CSphString & sKey ) { return sphCRC32 ( (const BYTE *)sKey.cstr() ); } }; void ISphQword::CollectHitMask() { if ( m_bAllFieldsKnown ) return; SeekHitlist ( m_iHitlistPos ); for ( Hitpos_t uHit = GetNextHit(); uHit!=EMPTY_HIT; uHit = GetNextHit() ) m_dQwordFields.Set ( HITMAN::GetField ( uHit ) ); m_bAllFieldsKnown = true; } /// match in the stream struct ExtDoc_t { SphDocID_t m_uDocid; CSphRowitem * m_pDocinfo; ///< for inline storage only SphOffset_t m_uHitlistOffset; DWORD m_uDocFields; float m_fTFIDF; }; /// word in the query struct ExtQword_t { CSphString m_sWord; ///< word CSphString m_sDictWord; ///< word as processed by dict int m_iDocs; ///< matching documents int m_iHits; ///< matching hits float m_fIDF; ///< IDF value int m_iQueryPos; ///< position in the query bool m_bExpanded; ///< added by prefix expansion bool m_bExcluded; ///< excluded by the query (eg. bb in (aa AND NOT bb)) }; /// query words set typedef CSphOrderedHash < ExtQword_t, CSphString, QwordsHash_fn, 256 > ExtQwordsHash_t; /// per-document zone information (span start/end positions) struct ZoneInfo_t { CSphVector m_dStarts; CSphVector m_dEnds; }; /// zone hash key, zoneid+docid struct ZoneKey_t { int m_iZone; SphDocID_t m_uDocid; explicit ZoneKey_t ( int iZone=0, SphDocID_t uDocid=0 ) : m_iZone ( iZone ) , m_uDocid ( uDocid ) {} bool operator == ( const ZoneKey_t & rhs ) const { return m_iZone==rhs.m_iZone && m_uDocid==rhs.m_uDocid; } }; /// zone hashing function struct ZoneHash_fn { static inline int Hash ( const ZoneKey_t & tKey ) { return (DWORD)tKey.m_uDocid ^ ( tKey.m_iZone<<16 ); } }; /// zone hash typedef CSphOrderedHash < ZoneInfo_t, ZoneKey_t, ZoneHash_fn, 4096 > ZoneHash_c; /// zonespan prototype typedef CSphOrderedHash < int, ZoneKey_t, ZoneHash_fn, 8192*1024 > ZoneSpans_c; /// generic match streamer class ExtNode_i { public: ExtNode_i (); virtual ~ExtNode_i () { SafeDeleteArray ( m_pDocinfo ); } static ExtNode_i * Create ( const XQNode_t * pNode, const ISphQwordSetup & tSetup ); static ExtNode_i * Create ( const XQKeyword_t & tWord, const XQNode_t * pNode, const ISphQwordSetup & tSetup ); static ExtNode_i * Create ( ISphQword * pQword, const XQNode_t * pNode, const ISphQwordSetup & tSetup ); virtual void Reset ( const ISphQwordSetup & tSetup ) = 0; virtual void HintDocid ( SphDocID_t uMinID ) = 0; virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ) = 0; virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ) = 0; virtual int GetQwords ( ExtQwordsHash_t & hQwords ) = 0; virtual void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) = 0; virtual void GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ) = 0; virtual bool GotHitless () = 0; virtual int GetDocsCount () { return INT_MAX; } virtual uint64_t GetWordID () const = 0; ///< for now, only used for duplicate keyword checks in quorum operator void DebugIndent ( int iLevel ) { while ( iLevel-- ) printf ( " " ); } virtual void DebugDump ( int iLevel ) { DebugIndent ( iLevel ); printf ( "ExtNode\n" ); } // return specific extra data may be associated with the node // intended to be used a bit similar to QueryInterface() in COM technology // but simpler due to enum instead of 128-bit GUID, and no ref. counting inline bool GetExtraData ( ExtraData_e eNode, void** ppData ) { return ExtraDataImpl ( eNode, ppData ); } private: virtual bool ExtraDataImpl ( ExtraData_e, void** ) { return false; } public: static const int MAX_DOCS = 512; static const int MAX_HITS = 512; int m_iAtomPos; ///< we now need it on this level for tricks like expanded keywords within phrases protected: ExtDoc_t m_dDocs[MAX_DOCS]; ExtHit_t m_dHits[MAX_HITS]; public: SphDocID_t m_uMaxID; int m_iStride; ///< docinfo stride (for inline mode only) protected: CSphRowitem * m_pDocinfo; ///< docinfo storage (for inline mode only) void AllocDocinfo ( const ISphQwordSetup & tSetup ) { if ( tSetup.m_iInlineRowitems ) { m_iStride = tSetup.m_iInlineRowitems; m_pDocinfo = new CSphRowitem [ MAX_DOCS*m_iStride ]; } } protected: inline const ExtDoc_t * ReturnDocsChunk ( int iCount, SphDocID_t * pMaxID ) { assert ( iCount>=0 && iCount=0 ); assert ( iPos & dTermDupes ); virtual bool GotHitless () { return false; } virtual int GetDocsCount () { return m_pQword->m_iDocs; } virtual uint64_t GetWordID () const { if ( m_pQword->m_iWordID ) return m_pQword->m_iWordID; else return sphFNV64 ( (const BYTE *)m_pQword->m_sDictWord.cstr() ); } virtual void HintDocid ( SphDocID_t uMinID ) { m_pQword->HintDocid ( uMinID ); if ( m_pStats ) m_pStats->m_iSkips++; if ( m_pNanoBudget ) *m_pNanoBudget -= g_iPredictorCostSkip; } virtual void DebugDump ( int iLevel ) { DebugIndent ( iLevel ); printf ( "ExtTerm: %s at: %d ", m_pQword->m_sWord.cstr(), m_pQword->m_iAtomPos ); if ( m_dQueriedFields.TestAll(true) ) { printf ( "(all)\n" ); } else { bool bFirst = true; printf ( "in: " ); for ( int iField = 0; iField < CSphSmallBitvec::iTOTALBITS; iField++ ) { if ( m_dQueriedFields.Test ( iField ) ) { if ( !bFirst ) printf ( ", " ); printf ( "%d", iField ); bFirst = false; } } printf ( "\n" ); } } protected: ISphQword * m_pQword; CSphSmallBitvec m_dQueriedFields; ///< accepted fields mask bool m_bHasWideFields; ///< whether fields mask for this term refer to fields 32+ float m_fIDF; ///< IDF for this term (might be 0.0f for non-1st occurences in query) int64_t m_iMaxTimer; ///< work until this timestamp CSphString * m_pWarning; bool m_bNotWeighted; CSphQueryStats * m_pStats; int64_t * m_pNanoBudget; ExtDoc_t * m_pLastChecked; ///< points to entry in m_dDocs which GetHitsChunk() currently emits hits for SphDocID_t m_uMatchChecked; ///< there are no more hits for matches block starting with this ID bool m_bTail; ///< should we emit more hits for current docid or proceed furthwer public: static volatile bool m_bInterruptNow; ///< may be set from outside to indicate the globally received sigterm }; /// Immediately interrupt current operation void sphInterruptNow() { ExtTerm_c::m_bInterruptNow = true; } bool sphInterrupted() { return ExtTerm_c::m_bInterruptNow; } volatile bool ExtTerm_c::m_bInterruptNow = false; /// single keyword streamer with artificial hitlist class ExtTermHitless_c : public ExtTerm_c { public: ExtTermHitless_c ( ISphQword * pQword, const CSphSmallBitvec& uFields, const ISphQwordSetup & tSetup, bool bNotWeighted ) : ExtTerm_c ( pQword, uFields, tSetup, bNotWeighted ) , m_uFieldPos ( 0 ) {} virtual void Reset ( const ISphQwordSetup & ) { m_uFieldPos = 0; } virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); virtual bool GotHitless () { return true; } protected: DWORD m_uFieldPos; }; ////////////////////////////////////////////////////////////////////////// class HitPointer { protected: inline void SetMyHit ( int, bool = false ) {} inline void CopyMyHit ( int, int ) {} }; /// position filter policy template < TermPosFilter_e T > class TermAcceptor_c : protected HitPointer { public: TermAcceptor_c ( ISphQword *, const XQNode_t *, const ISphQwordSetup & ) {} protected: inline bool IsAcceptableHit ( const ExtHit_t * ) const { return true; } inline void Reset() {} }; template<> class TermAcceptor_c : protected HitPointer { public: TermAcceptor_c ( ISphQword *, const XQNode_t * pNode, const ISphQwordSetup & ) : m_iMaxFieldPos ( pNode->m_dSpec.m_iFieldMaxPos ) {} protected: inline bool IsAcceptableHit ( const ExtHit_t * ) const; inline void Reset() { m_iMaxFieldPos = 0; } private: int m_iMaxFieldPos; }; template<> class TermAcceptor_c : protected HitPointer { public: TermAcceptor_c ( ISphQword *, const XQNode_t * pNode, const ISphQwordSetup & tSetup ) : m_pZoneChecker ( tSetup.m_pZoneChecker ) , m_dZones ( pNode->m_dSpec.m_dZones ) , m_uLastZonedId ( 0 ) , m_iCheckFrom ( 0 ) {} protected: inline bool IsAcceptableHit ( const ExtHit_t * pHit ) const; inline void Reset() { m_uLastZonedId = 0; m_iCheckFrom = 0; } protected: ISphZoneCheck * m_pZoneChecker; ///< zone-limited searches query ranker about zones mutable CSphVector m_dZones; ///< zone ids for this particular term mutable SphDocID_t m_uLastZonedId; mutable int m_iCheckFrom; }; template<> class TermAcceptor_c : public TermAcceptor_c { public: TermAcceptor_c ( ISphQword * pWord, const XQNode_t * pNode, const ISphQwordSetup & tSetup ) : TermAcceptor_c ( pWord, pNode, tSetup) , m_iMyHit ( -1 ) , m_bFinal ( false ) , m_dMyZones ( pNode->m_dSpec.m_dZones.GetLength() ) , m_dFinalZones ( pNode->m_dSpec.m_dZones.GetLength() ) {} protected: inline bool IsAcceptableHit ( const ExtHit_t * pHit ) const; inline void Reset() { TermAcceptor_c::Reset(); m_dMyZones.ResetZones(); m_dFinalZones.ResetZones(); } inline void SetMyHit ( int iHit, bool bFinal = false ) { m_iMyHit = iHit; m_bFinal = bFinal;} inline void CopyMyHit ( int iSrc, int iDst ) { m_dMyZones.CopyZVecTo ( iSrc, m_dFinalZones.GetZVec ( iDst ) ); } private: int m_iMyHit; ///< the current num of hit in internal buffer bool m_bFinal; ///< whether we point to our buffer, or temporary one for filtered zones protected: ZoneSpansHolder m_dMyZones; ///< extra buffer for filtered zones ZoneSpansHolder m_dFinalZones; ///< the actual buffer of the linked node }; template<> class TermAcceptor_c : public TermAcceptor_c { public: TermAcceptor_c ( ISphQword * pWord, const XQNode_t * pNode, const ISphQwordSetup & tSetup ) : TermAcceptor_c ( pWord, pNode, tSetup) {} protected: inline bool IsAcceptableHit ( const ExtHit_t * ) const { return true; } }; /// class BufferedNode_c { protected: BufferedNode_c () : m_uTermMaxID ( 0 ) , m_pRawDocs ( NULL ) , m_pRawDoc ( NULL ) , m_pRawHit ( NULL ) , m_uLastID ( 0 ) , m_eState ( COPY_DONE ) , m_uDoneFor ( 0 ) { m_dMyDocs[0].m_uDocid = DOCID_MAX; m_dMyHits[0].m_uDocid = DOCID_MAX; m_dFilteredHits[0].m_uDocid = DOCID_MAX; } void Reset () { m_uTermMaxID = 0; m_pRawDocs = NULL; m_pRawDoc = NULL; m_pRawHit = NULL; m_uLastID = 0; m_eState = COPY_DONE; m_uDoneFor = 0; m_dMyDocs[0].m_uDocid = DOCID_MAX; m_dMyHits[0].m_uDocid = DOCID_MAX; m_dFilteredHits[0].m_uDocid = DOCID_MAX; } protected: SphDocID_t m_uTermMaxID; const ExtDoc_t * m_pRawDocs; ///< chunk start as returned by raw GetDocsChunk() (need to store it for raw GetHitsChunk() calls) const ExtDoc_t * m_pRawDoc; ///< current position in raw docs chunk const ExtHit_t * m_pRawHit; ///< current position in raw hits chunk SphDocID_t m_uLastID; enum { COPY_FILTERED, COPY_TRAILING, COPY_DONE } m_eState; ///< internal GetHitsChunk() state (are we copying from my hits, or passing trailing raw hits, or done) ExtDoc_t m_dMyDocs[ExtNode_i::MAX_DOCS]; ///< all documents within the required pos range ExtHit_t m_dMyHits[ExtNode_i::MAX_HITS]; ///< all hits within the required pos range ExtHit_t m_dFilteredHits[ExtNode_i::MAX_HITS]; ///< hits from requested subset of the documents (for GetHitsChunk()) SphDocID_t m_uDoneFor; }; /// single keyword streamer, with term position filtering template < TermPosFilter_e T, class ExtBase=ExtTerm_c > class ExtConditional : public BufferedNode_c, public ExtBase, protected TermAcceptor_c { typedef TermAcceptor_c t_Acceptor; protected: ExtConditional ( ISphQword * pQword, const XQNode_t * pNode, const ISphQwordSetup & tSetup ); public: virtual void Reset ( const ISphQwordSetup & tSetup ); virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); virtual bool GotHitless () { return false; } private: virtual bool ExtraDataImpl ( ExtraData_e eData, void ** ppResult ); }; /// single keyword streamer, with term position filtering template < TermPosFilter_e T > class ExtTermPos_c : public ExtConditional { public: ExtTermPos_c ( ISphQword * pQword, const XQNode_t * pNode, const ISphQwordSetup & tSetup ) : ExtConditional ( pQword, pNode, tSetup ) { ExtTerm_c::Init ( pQword, pNode->m_dSpec.m_dFieldMask, tSetup, pNode->m_bNotWeighted ); } }; template<> bool ExtConditional::ExtraDataImpl ( ExtraData_e eData, void ** ppResult ) { assert ( ppResult ); if ( eData==EXTRA_GET_DATA_ZONESPANS ) { *ppResult = &m_dFinalZones; return true; } return false; } template bool ExtConditional::ExtraDataImpl ( ExtraData_e, void ** ) { return false; } /// multi-node binary-operation streamer traits class ExtTwofer_c : public ExtNode_i { public: ExtTwofer_c ( ExtNode_i * pFirst, ExtNode_i * pSecond, const ISphQwordSetup & tSetup ); ExtTwofer_c () {} ///< to be used in pair with Init(); ~ExtTwofer_c (); void Init ( ExtNode_i * pFirst, ExtNode_i * pSecond, const ISphQwordSetup & tSetup ); virtual void Reset ( const ISphQwordSetup & tSetup ); virtual int GetQwords ( ExtQwordsHash_t & hQwords ); virtual void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ); virtual void GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ); virtual bool GotHitless () { return m_pChildren[0]->GotHitless() || m_pChildren[1]->GotHitless(); } void DebugDumpT ( const char * sName, int iLevel ) { DebugIndent ( iLevel ); printf ( "%s:\n", sName ); m_pChildren[0]->DebugDump ( iLevel+1 ); m_pChildren[1]->DebugDump ( iLevel+1 ); } void SetNodePos ( WORD uPosLeft, WORD uPosRight ) { m_dNodePos[0] = uPosLeft; m_dNodePos[1] = uPosRight; m_bPosAware = true; } virtual void HintDocid ( SphDocID_t uMinID ) { m_pChildren[0]->HintDocid ( uMinID ); m_pChildren[1]->HintDocid ( uMinID ); } virtual uint64_t GetWordID () const { return m_pChildren[0]->GetWordID() ^ m_pChildren[1]->GetWordID(); } protected: ExtNode_i * m_pChildren[2]; const ExtDoc_t * m_pCurDoc[2]; const ExtHit_t * m_pCurHit[2]; WORD m_dNodePos[2]; bool m_bPosAware; SphDocID_t m_uMatchedDocid; }; /// A-and-B streamer class ExtAnd_c : public ExtTwofer_c { public: ExtAnd_c ( ExtNode_i * pFirst, ExtNode_i * pSecond, const ISphQwordSetup & tSetup ) : ExtTwofer_c ( pFirst, pSecond, tSetup ) {} ExtAnd_c() {} ///< to be used with Init() virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); void DebugDump ( int iLevel ) { DebugDumpT ( "ExtAnd", iLevel ); } }; class ExtAndZonespanned : public ExtAnd_c { public: ExtAndZonespanned () {} ///< to be used in pair with Init() inline void Init ( ExtNode_i * pFirst, ExtNode_i * pSecond, const ISphQwordSetup & tSetup, ZoneSpansHolder * pSpans ) { ExtAnd_c::Init ( pFirst, pSecond, tSetup ); m_pSpans = pSpans; m_pLastBaseHit[0] = NULL; m_pLastBaseHit[1] = NULL; if ( pFirst && !pFirst->GetExtraData ( EXTRA_GET_DATA_ZONESPANS, (void**) &m_dChildzones[0] ) ) assert ( false ); if ( pSecond && !pSecond->GetExtraData ( EXTRA_GET_DATA_ZONESPANS, (void**) &m_dChildzones[1] ) ) assert ( false ); } virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); void DebugDump ( int iLevel ) { DebugDumpT ( "ExtAndZonespan", iLevel ); } private: ZoneSpansHolder * m_dChildzones[2]; ZoneSpansHolder * m_pSpans; const ExtHit_t * m_pLastBaseHit[2]; private: bool IsSameZonespan ( int iHit, int iProofHit ) const; }; class ExtAndZonespan_c : public ExtConditional < TERM_POS_NONE, ExtAndZonespanned > { public: ExtAndZonespan_c ( ExtNode_i *pFirst, ExtNode_i *pSecond, const ISphQwordSetup & tSetup, const XQNode_t * pNode ) : ExtConditional ( NULL, pNode, tSetup ) { ExtAndZonespanned::Init ( pFirst, pSecond, tSetup, &m_dMyZones ); } private: bool ExtraDataImpl ( ExtraData_e eData, void ** ppResult ) { assert ( ppResult ); if ( eData==EXTRA_GET_DATA_ZONESPANS ) { *ppResult = &m_dFinalZones; return true; } return false; } }; /// A-or-B streamer class ExtOr_c : public ExtTwofer_c { public: ExtOr_c ( ExtNode_i * pFirst, ExtNode_i * pSecond, const ISphQwordSetup & tSetup ) : ExtTwofer_c ( pFirst, pSecond, tSetup ) {} virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); void DebugDump ( int iLevel ) { DebugDumpT ( "ExtOr", iLevel ); } }; /// A-and-not-B streamer class ExtAndNot_c : public ExtTwofer_c { public: ExtAndNot_c ( ExtNode_i * pFirst, ExtNode_i * pSecond, const ISphQwordSetup & tSetup ); virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); virtual void Reset ( const ISphQwordSetup & tSetup ); void DebugDump ( int iLevel ) { DebugDumpT ( "ExtAndNot", iLevel ); } protected: bool m_bPassthrough; }; /// generic operator over N nodes class ExtNWayT : public ExtNode_i { public: ExtNWayT ( const CSphVector & dNodes, const ISphQwordSetup & tSetup ); ~ExtNWayT (); virtual void Reset ( const ISphQwordSetup & tSetup ); virtual int GetQwords ( ExtQwordsHash_t & hQwords ); virtual void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ); virtual void GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ); virtual bool GotHitless () { return false; } virtual void HintDocid ( SphDocID_t uMinID ) { m_pNode->HintDocid ( uMinID ); } virtual uint64_t GetWordID () const; protected: ExtNode_i * m_pNode; ///< my and-node for all the terms const ExtDoc_t * m_pDocs; ///< current docs chunk from and-node SphDocID_t m_uDocsMaxID; ///< max id in current docs chunk const ExtHit_t * m_pHits; ///< current hits chunk from and-node const ExtDoc_t * m_pDoc; ///< current doc from and-node const ExtHit_t * m_pHit; ///< current hit from and-node const ExtDoc_t * m_pMyDoc; ///< current doc for hits getter const ExtHit_t * m_pMyHit; ///< current hit for hits getter SphDocID_t m_uLastDocID; ///< last emitted hit ExtHit_t m_dMyHits[MAX_HITS]; ///< buffer for all my phrase hits; inherited m_dHits will receive filtered results SphDocID_t m_uMatchedDocid; ///< doc currently in process SphDocID_t m_uHitsOverFor; ///< there are no more hits for matches block starting with this ID protected: inline void ConstructNode ( const CSphVector & dNodes, const CSphVector & dPositions, const ISphQwordSetup & tSetup ) { assert ( m_pNode==NULL ); WORD uLPos = dPositions[0]; ExtNode_i * pCur = dNodes[uLPos++]; // ++ for zero-based to 1-based ExtAnd_c * pCurEx = NULL; DWORD uLeaves = dNodes.GetLength(); WORD uRPos; for ( DWORD i=1; iSetNodePos ( uLPos, uRPos ); uLPos = 0; } m_pNode = pCur; } }; struct ExtNodeTF_fn { bool IsLess ( ExtNode_i * pA, ExtNode_i * pB ) const { return pA->GetDocsCount() < pB->GetDocsCount(); } }; struct ExtNodeTFExt_fn { const CSphVector & m_dNodes; explicit ExtNodeTFExt_fn ( const CSphVector & dNodes ) : m_dNodes ( dNodes ) {} ExtNodeTFExt_fn ( const ExtNodeTFExt_fn & rhs ) : m_dNodes ( rhs.m_dNodes ) {} bool IsLess ( WORD uA, WORD uB ) const { return m_dNodes[uA]->GetDocsCount() < m_dNodes[uB]->GetDocsCount(); } private: const ExtNodeTFExt_fn & operator = ( const ExtNodeTFExt_fn & ) { return *this; } }; template < class FSM > class ExtNWay_c : public ExtNWayT, private FSM { public: ExtNWay_c ( const CSphVector & dNodes, const XQNode_t & tNode, const ISphQwordSetup & tSetup ) : ExtNWayT ( dNodes, tSetup ) , FSM ( dNodes, tNode, tSetup ) { bool bTerms = FSM::bTermsTree; // workaround MSVC const condition warning CSphVector dPositions ( dNodes.GetLength() ); ARRAY_FOREACH ( i, dPositions ) dPositions[i] = (WORD) i; if ( bTerms ) dPositions.Sort ( ExtNodeTFExt_fn ( dNodes ) ); ConstructNode ( dNodes, dPositions, tSetup ); } public: virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); virtual void DebugDump ( int iLevel ) { DebugIndent ( iLevel ); printf ( "%s\n", FSM::GetName() ); m_pNode->DebugDump ( iLevel+1 ); } private: bool EmitTail ( int & iHit ); ///< the "trickiest part" extracted in order to process the proximity also }; class FSMphrase { protected: struct State_t { int m_iTagQword; int m_iExpHitpos; }; protected: static const bool bTermsTree = true; ///< we work with ExtTerm nodes FSMphrase ( const CSphVector & dQwords, const XQNode_t & tNode, const ISphQwordSetup & tSetup ); bool HitFSM ( const ExtHit_t* pHit, ExtHit_t* dTarget ); inline static const char * GetName() { return "ExtPhrase"; } inline void ResetFSM() { m_dStates.Resize(0); } protected: CSphVector m_dQposDelta; ///< next expected qpos delta for each existing qpos (for skipped stopwords case) CSphVector m_dAtomPos; ///< lets use it as finite automata states and keep references on it CSphVector m_dStates; ///< pointers to states of finite automata }; /// exact phrase streamer typedef ExtNWay_c < FSMphrase > ExtPhrase_c; /// proximity streamer class FSMproximity { protected: static const bool bTermsTree = true; ///< we work with ExtTerm nodes FSMproximity ( const CSphVector & dQwords, const XQNode_t & tNode, const ISphQwordSetup & tSetup ); bool HitFSM ( const ExtHit_t* pHit, ExtHit_t* dTarget ); inline static const char * GetName() { return "ExtProximity"; } inline void ResetFSM() { m_uExpPos = 0; m_uWords = 0; m_iMinQindex = -1; ARRAY_FOREACH ( i, m_dProx ) m_dProx[i] = UINT_MAX; } protected: int m_iMaxDistance; DWORD m_uWordsExpected; DWORD m_uMinQpos; DWORD m_uQLen; DWORD m_uExpPos; CSphVector m_dProx; // proximity hit position for i-th word CSphVector m_dDeltas; // used for weight calculation DWORD m_uWords; int m_iMinQindex; }; /// exact phrase streamer typedef ExtNWay_c ExtProximity_c; /// proximity streamer class FSMmultinear { protected: static const bool bTermsTree = true; ///< we work with generic (not just ExtTerm) nodes FSMmultinear ( const CSphVector & dNodes, const XQNode_t & tNode, const ISphQwordSetup & tSetup ); bool HitFSM ( const ExtHit_t * pHit, ExtHit_t * dTarget ); inline static const char * GetName() { return "ExtMultinear"; } inline void ResetFSM() { m_iRing = m_uLastP = m_uPrelastP = 0; } protected: int m_iNear; ///< the NEAR distance DWORD m_uPrelastP; DWORD m_uPrelastML; DWORD m_uPrelastSL; DWORD m_uPrelastW; DWORD m_uLastP; ///< position of the last hit DWORD m_uLastML; ///< the length of the previous hit DWORD m_uLastSL; ///< the length of the previous hit in Query DWORD m_uLastW; ///< last weight DWORD m_uWordsExpected; ///< now many hits we're expect DWORD m_uWeight; ///< weight accum DWORD m_uFirstHit; ///< hitpos of the beginning of the match chain WORD m_uFirstNpos; ///< N-position of the head of the chain WORD m_uFirstQpos; ///< Q-position of the head of the chain (for twofers) CSphVector m_dNpos; ///< query positions for multinear CSphVector m_dRing; ///< ring buffer for multihit data int m_iRing; ///< the head of the ring bool m_bTwofer; ///< if we have 2- or N-way NEAR private: inline int RingTail() const { return ( m_iRing + m_dNpos.GetLength() - 1 ) % m_uWordsExpected; } inline void Add2Ring ( const ExtHit_t* pHit ) { if ( !m_bTwofer ) m_dRing [ RingTail() ] = *pHit; } inline void ShiftRing() { if ( ++m_iRing==(int)m_uWordsExpected ) m_iRing=0; } }; /// exact phrase streamer typedef ExtNWay_c ExtMultinear_c; /// quorum streamer class ExtQuorum_c : public ExtNode_i { public: ExtQuorum_c ( CSphVector & dQwords, const XQNode_t & tNode, const ISphQwordSetup & tSetup ); virtual ~ExtQuorum_c (); virtual void Reset ( const ISphQwordSetup & tSetup ); virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); virtual int GetQwords ( ExtQwordsHash_t & hQwords ); virtual void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ); virtual void GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ); virtual uint64_t GetWordID () const; virtual bool GotHitless () { return false; } virtual void HintDocid ( SphDocID_t uMinID ) { ARRAY_FOREACH ( i, m_dChildren ) if ( m_dChildren[i].m_pTerm ) m_dChildren[i].m_pTerm->HintDocid ( uMinID ); } static int GetThreshold ( const XQNode_t & tNode, int iQwords ); private: struct TermTuple_t { ExtNode_i * m_pTerm; ///< my children nodes (simply ExtTerm_c for now, not true anymore) const ExtDoc_t * m_pCurDoc; ///< current positions into children doclists const ExtHit_t * m_pCurHit; ///< current positions into children hitlists int m_iCount; ///< terms count in case of dupes bool m_bStandStill; ///< should we emit hits to proceed further }; ExtHit_t m_dQuorumHits[MAX_HITS]; ///< buffer for all my quorum hits; inherited m_dHits will receive filtered results int m_iMyHitCount; ///< hits collected so far int m_iMyLast; ///< hits processed so far CSphVector m_dInitialChildren; ///< my children nodes (simply ExtTerm_c for now) CSphVector m_dChildren; SphDocID_t m_uMatchedDocid; ///< tail docid for hitlist emission int m_iThresh; ///< keyword count threshold // FIXME!!! also skip hits processing for children w\o constrains ( zones or field limit ) bool m_bHasDupes; ///< should we analyze hits on docs collecting // check for hits that matches and return flag that docs might be advanced bool CollectMatchingHits ( SphDocID_t uDocid, int iQuorum ); const ExtHit_t * GetHitsChunkDupes ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); const ExtHit_t * GetHitsChunkDupesTail (); const ExtHit_t * GetHitsChunkSimple ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); int CountQuorum ( bool bFixDupes ) { if ( !m_bHasDupes ) return m_dChildren.GetLength(); int iSum = 0; bool bHasDupes = false; ARRAY_FOREACH ( i, m_dChildren ) { iSum += m_dChildren[i].m_iCount; bHasDupes |= ( m_dChildren[i].m_iCount>1 ); } m_bHasDupes = bFixDupes ? bHasDupes : m_bHasDupes; return iSum; } inline const ExtHit_t * ReturnHitsChunk ( int iCount ) { assert ( iCount>=0 && iCount & dChildren, const ISphQwordSetup & tSetup ); ~ExtOrder_c (); virtual void Reset ( const ISphQwordSetup & tSetup ); virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t ); virtual int GetQwords ( ExtQwordsHash_t & hQwords ); virtual void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ); virtual void GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ); virtual bool GotHitless () { return false; } virtual uint64_t GetWordID () const; virtual void HintDocid ( SphDocID_t uMinID ) { ARRAY_FOREACH ( i, m_dChildren ) m_dChildren[i]->HintDocid ( uMinID ); } protected: CSphVector m_dChildren; CSphVector m_pDocsChunk; ///< last document chunk (for hit fetching) CSphVector m_pDocs; ///< current position in document chunk CSphVector m_pHits; ///< current position in hits chunk CSphVector m_dMaxID; ///< max DOCID from the last chunk ExtHit_t m_dMyHits[MAX_HITS]; ///< buffer for all my phrase hits; inherited m_dHits will receive filtered results bool m_bDone; SphDocID_t m_uHitsOverFor; protected: int GetNextHit ( SphDocID_t uDocid ); ///< get next hit within given document, and return its child-id int GetMatchingHits ( SphDocID_t uDocid, ExtHit_t * pHitbuf, int iLimit ); ///< process candidate hits and stores actual matches while we can }; /// same-text-unit streamer /// (aka, A and B within same sentence, or same paragraph) class ExtUnit_c : public ExtNode_i { public: ExtUnit_c ( ExtNode_i * pFirst, ExtNode_i * pSecond, const CSphSmallBitvec& dFields, const ISphQwordSetup & tSetup, const char * sUnit ); ~ExtUnit_c (); virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ); virtual void Reset ( const ISphQwordSetup & tSetup ); virtual int GetQwords ( ExtQwordsHash_t & hQwords ); virtual void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ); virtual void GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ); virtual uint64_t GetWordID () const; public: virtual bool GotHitless () { return false; } virtual void HintDocid ( SphDocID_t uMinID ) { m_pArg1->HintDocid ( uMinID ); m_pArg2->HintDocid ( uMinID ); } virtual void DebugDump ( int iLevel ) { DebugIndent ( iLevel ); printf ( "ExtSentence\n" ); m_pArg1->DebugDump ( iLevel+1 ); m_pArg2->DebugDump ( iLevel+1 ); } protected: inline const ExtDoc_t * ReturnDocsChunk ( int iDocs, int iMyHit, SphDocID_t * pMaxID ) { assert ( iMyHitGetQwords ( hQwords ) : -1; } virtual void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ); virtual void SetTermDupes ( const ExtQwordsHash_t & , int ) {} virtual bool InitState ( const CSphQueryContext &, CSphString & ) { return true; } public: // FIXME? hide and friend? virtual SphZoneHit_e IsInZone ( int iZone, const ExtHit_t * pHit, int * pLastSpan=0 ); virtual const CSphIndex * GetIndex() { return m_pIndex; } public: CSphMatch m_dMatches[ExtNode_i::MAX_DOCS]; ///< exposed for caller DWORD m_uPayloadMask; ///< exposed for ranker state functors int m_iQwords; ///< exposed for ranker state functors int m_iMaxQpos; ///< max in-query pos among all keywords, including dupes; for ranker state functors protected: int m_iInlineRowitems; ExtNode_i * m_pRoot; const ExtDoc_t * m_pDoclist; const ExtHit_t * m_pHitlist; SphDocID_t m_uMaxID; ExtDoc_t m_dMyDocs[ExtNode_i::MAX_DOCS]; ///< my local documents pool; for filtering CSphMatch m_dMyMatches[ExtNode_i::MAX_DOCS]; ///< my local matches pool; for filtering CSphMatch m_tTestMatch; const CSphIndex * m_pIndex; ///< this is he who'll do my filtering! CSphQueryContext * m_pCtx; int64_t * m_pNanoBudget; protected: CSphVector m_dZones; CSphVector m_dZoneStartTerm; CSphVector m_dZoneEndTerm; CSphVector m_dZoneStart; CSphVector m_dZoneEnd; CSphVector m_dZoneMax; ///< last docid we (tried) to cache CSphVector m_dZoneMin; ///< first docid we (tried) to cache ZoneHash_c m_hZoneInfo; bool m_bZSlist; }; STATIC_ASSERT ( ( 8*8*sizeof(DWORD) )>=SPH_MAX_FIELDS, PAYLOAD_MASK_OVERFLOW ); static const bool WITH_BM25 = true; template < bool USE_BM25 = false > class ExtRanker_WeightSum_c : public ExtRanker_c { protected: int m_iWeights; const int * m_pWeights; public: ExtRanker_WeightSum_c ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup ) : ExtRanker_c ( tXQ, tSetup ) {} virtual int GetMatches (); virtual bool InitState ( const CSphQueryContext & tCtx, CSphString & ) { m_iWeights = tCtx.m_iWeights; m_pWeights = tCtx.m_dWeights; return true; } }; class ExtRanker_None_c : public ExtRanker_c { public: ExtRanker_None_c ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup ) : ExtRanker_c ( tXQ, tSetup ) {} virtual int GetMatches (); }; template < typename STATE > class ExtRanker_T : public ExtRanker_c { protected: STATE m_tState; ZoneSpansHolder * m_pZones; const ExtHit_t * m_pHitBase; CSphVector m_dZonespans; // zonespanlists for my matches public: ExtRanker_T ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup ); virtual int GetMatches (); virtual bool InitState ( const CSphQueryContext & tCtx, CSphString & sError ) { return m_tState.Init ( tCtx.m_iWeights, &tCtx.m_dWeights[0], this, sError ); } private: virtual bool ExtraDataImpl ( ExtraData_e eType, void ** ppResult ) { switch ( eType ) { case EXTRA_GET_DATA_ZONESPANS: assert ( ppResult ); *ppResult = &m_dZonespans; return true; default: return m_tState.ExtraData ( eType, ppResult ); } } }; ////////////////////////////////////////////////////////////////////////// static inline void CopyExtDocinfo ( ExtDoc_t & tDst, const ExtDoc_t & tSrc, CSphRowitem ** ppRow, int iStride ) { if ( tSrc.m_pDocinfo ) { assert ( ppRow && *ppRow ); memcpy ( *ppRow, tSrc.m_pDocinfo, iStride*sizeof(CSphRowitem) ); tDst.m_pDocinfo = *ppRow; *ppRow += iStride; } else tDst.m_pDocinfo = NULL; } static inline void CopyExtDoc ( ExtDoc_t & tDst, const ExtDoc_t & tSrc, CSphRowitem ** ppRow, int iStride ) { tDst = tSrc; CopyExtDocinfo ( tDst, tSrc, ppRow, iStride ); } ExtNode_i::ExtNode_i () : m_iAtomPos(0) , m_uMaxID(0) , m_iStride(0) , m_pDocinfo(NULL) { m_dDocs[0].m_uDocid = DOCID_MAX; m_dHits[0].m_uDocid = DOCID_MAX; } static ISphQword * CreateQueryWord ( const XQKeyword_t & tWord, const ISphQwordSetup & tSetup, CSphDict * pZonesDict=NULL ) { BYTE sTmp [ 3*SPH_MAX_WORD_LEN + 16 ]; strncpy ( (char*)sTmp, tWord.m_sWord.cstr(), sizeof(sTmp) ); sTmp[sizeof(sTmp)-1] = '\0'; ISphQword * pWord = tSetup.QwordSpawn ( tWord ); pWord->m_sWord = tWord.m_sWord; CSphDict * pDict = pZonesDict ? pZonesDict : tSetup.m_pDict; pWord->m_iWordID = tWord.m_bMorphed ? pDict->GetWordIDNonStemmed ( sTmp ) : pDict->GetWordID ( sTmp ); pWord->m_sDictWord = (char*)sTmp; pWord->m_bExpanded = tWord.m_bExpanded; tSetup.QwordSetup ( pWord ); if ( tWord.m_bFieldStart && tWord.m_bFieldEnd ) pWord->m_iTermPos = TERM_POS_FIELD_STARTEND; else if ( tWord.m_bFieldStart ) pWord->m_iTermPos = TERM_POS_FIELD_START; else if ( tWord.m_bFieldEnd ) pWord->m_iTermPos = TERM_POS_FIELD_END; else pWord->m_iTermPos = 0; pWord->m_iAtomPos = tWord.m_iAtomPos; return pWord; } template < typename T > static ExtNode_i * CreateMultiNode ( const XQNode_t * pQueryNode, const ISphQwordSetup & tSetup, bool bNeedsHitlist ) { /////////////////////////////////// // virtually plain (expanded) node /////////////////////////////////// if ( pQueryNode->m_dChildren.GetLength() ) { CSphVector dNodes; ARRAY_FOREACH ( i, pQueryNode->m_dChildren ) { ExtNode_i * pTerm = ExtNode_i::Create ( pQueryNode->m_dChildren[i], tSetup ); assert ( !pTerm || pTerm->m_iAtomPos>=0 ); if ( pTerm ) dNodes.Add ( pTerm ); } if ( dNodes.GetLength()<2 ) { ARRAY_FOREACH ( i, dNodes ) SafeDelete ( dNodes[i] ); if ( tSetup.m_pWarning ) tSetup.m_pWarning->SetSprintf ( "can't create phrase node, hitlists unavailable (hitlists=%d, nodes=%d)", dNodes.GetLength(), pQueryNode->m_dChildren.GetLength() ); return NULL; } // FIXME! tricky combo again // quorum+expand used KeywordsEqual() path to drill down until actual nodes // that path is no longer and quorum+expand+dupes might probably fail now // the proper solution would be to have dNodes return proper wordids somehow ExtNode_i * pResult = new T ( dNodes, *pQueryNode, tSetup ); if ( pQueryNode->GetCount() ) return tSetup.m_pNodeCache->CreateProxy ( pResult, pQueryNode, tSetup ); return pResult; // FIXME! sorry, no hitless vs expand vs phrase support for now! } ////////////////////// // regular plain node ////////////////////// ExtNode_i * pResult = NULL; CSphVector dQwordsHit; // have hits CSphVector dQwords; // don't have hits // partition phrase words const CSphVector & dWords = pQueryNode->m_dWords; ARRAY_FOREACH ( i, dWords ) { ISphQword * pWord = CreateQueryWord ( dWords[i], tSetup ); if ( pWord->m_bHasHitlist || !bNeedsHitlist ) dQwordsHit.Add ( pWord ); else dQwords.Add ( pWord ); } // see if we can create the node if ( dQwordsHit.GetLength()<2 ) { ARRAY_FOREACH ( i, dQwords ) SafeDelete ( dQwords[i] ); ARRAY_FOREACH ( i, dQwordsHit ) SafeDelete ( dQwordsHit[i] ); if ( tSetup.m_pWarning ) tSetup.m_pWarning->SetSprintf ( "can't create phrase node, hitlists unavailable (hitlists=%d, nodes=%d)", dQwordsHit.GetLength(), dWords.GetLength() ); return NULL; } else { // at least two words have hitlists, creating phrase node assert ( pQueryNode ); assert ( pQueryNode->m_dWords.GetLength() ); assert ( pQueryNode->GetOp()==SPH_QUERY_PHRASE || pQueryNode->GetOp()==SPH_QUERY_PROXIMITY || pQueryNode->GetOp()==SPH_QUERY_QUORUM ); // create nodes CSphVector dNodes; ARRAY_FOREACH ( i, dQwordsHit ) { dNodes.Add ( ExtNode_i::Create ( dQwordsHit[i], pQueryNode, tSetup ) ); dNodes.Last()->m_iAtomPos = dQwordsHit[i]->m_iAtomPos; } pResult = new T ( dNodes, *pQueryNode, tSetup ); } // AND result with the words that had no hitlist if ( dQwords.GetLength() ) { ExtNode_i * pNode = ExtNode_i::Create ( dQwords[0], pQueryNode, tSetup ); for ( int i=1; iGetCount() ) return tSetup.m_pNodeCache->CreateProxy ( pResult, pQueryNode, tSetup ); return pResult; } static ExtNode_i * CreateOrderNode ( const XQNode_t * pNode, const ISphQwordSetup & tSetup ) { if ( pNode->m_dChildren.GetLength()<2 ) { if ( tSetup.m_pWarning ) tSetup.m_pWarning->SetSprintf ( "order node requires at least two children" ); return NULL; } CSphVector dChildren; ARRAY_FOREACH ( i, pNode->m_dChildren ) { ExtNode_i * pChild = ExtNode_i::Create ( pNode->m_dChildren[i], tSetup ); if ( !pChild || pChild->GotHitless() ) { if ( tSetup.m_pWarning ) tSetup.m_pWarning->SetSprintf ( "failed to create order node, hitlist unavailable" ); ARRAY_FOREACH ( j, dChildren ) SafeDelete ( dChildren[j] ); return NULL; } dChildren.Add ( pChild ); } ExtNode_i * pResult = new ExtOrder_c ( dChildren, tSetup ); if ( pNode->GetCount() ) return tSetup.m_pNodeCache->CreateProxy ( pResult, pNode, tSetup ); return pResult; } ExtNode_i * ExtNode_i::Create ( const XQKeyword_t & tWord, const XQNode_t * pNode, const ISphQwordSetup & tSetup ) { return Create ( CreateQueryWord ( tWord, tSetup ), pNode, tSetup ); }; ExtNode_i * ExtNode_i::Create ( ISphQword * pQword, const XQNode_t * pNode, const ISphQwordSetup & tSetup ) { assert ( pQword ); if ( pNode->m_dSpec.m_iFieldMaxPos ) pQword->m_iTermPos = TERM_POS_FIELD_LIMIT; if ( pNode->m_dSpec.m_dZones.GetLength() ) pQword->m_iTermPos = pNode->m_dSpec.m_bZoneSpan ? TERM_POS_ZONESPAN : TERM_POS_ZONES; if ( !pQword->m_bHasHitlist ) { if ( tSetup.m_pWarning && pQword->m_iTermPos ) tSetup.m_pWarning->SetSprintf ( "hitlist unavailable, position limit ignored" ); return new ExtTermHitless_c ( pQword, pNode->m_dSpec.m_dFieldMask, tSetup, pNode->m_bNotWeighted ); } switch ( pQword->m_iTermPos ) { case TERM_POS_FIELD_STARTEND: return new ExtTermPos_c ( pQword, pNode, tSetup ); case TERM_POS_FIELD_START: return new ExtTermPos_c ( pQword, pNode, tSetup ); case TERM_POS_FIELD_END: return new ExtTermPos_c ( pQword, pNode, tSetup ); case TERM_POS_FIELD_LIMIT: return new ExtTermPos_c ( pQword, pNode, tSetup ); case TERM_POS_ZONES: return new ExtTermPos_c ( pQword, pNode, tSetup ); case TERM_POS_ZONESPAN: return new ExtTermPos_c ( pQword, pNode, tSetup ); default: return new ExtTerm_c ( pQword, pNode->m_dSpec.m_dFieldMask, tSetup, pNode->m_bNotWeighted ); } } ////////////////////////////////////////////////////////////////////////// struct ExtCacheEntry_t { SphDocID_t m_uDocid; Hitpos_t m_uHitpos; union { int m_iQword; ///< filled in ctor, used in SetQwordsIDF() float m_fTFIDF; ///< filled in SetQwordsIDF(), used in searching }; bool operator < ( const ExtCacheEntry_t & r ) const { if ( m_uDocid!=r.m_uDocid ) return m_uDocid < r.m_uDocid; return m_uHitpos < r.m_uHitpos; } }; struct ExtCachedKeyword_t : public XQKeyword_t { CSphString m_sDictWord; SphWordID_t m_iWordID; float m_fIDF; int m_iDocs; int m_iHits; }; /// simple in-memory multi-term cache class ExtCached_c : public ExtNode_i { protected: CSphVector m_dCache; CSphFixedVector m_dWords; CSphSmallBitvec m_dFieldMask; bool m_bExcluded; int m_iUniqDocs; int m_iCurDocsBegin; ///< start of the last docs chunk returned, inclusive, ie [begin,end) int m_iCurDocsEnd; ///< end of the last docs chunk returned, exclusive, ie [begin,end) int m_iCurHit; ///< end of the last hits chunk (within the last docs chunk) returned, exclusive uint64_t m_uWordID; public: explicit ExtCached_c ( const XQNode_t * pNode, const ISphQwordSetup & tSetup ); virtual void Reset ( const ISphQwordSetup & tSetup ); virtual void HintDocid ( SphDocID_t ) {} virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t ); virtual int GetQwords ( ExtQwordsHash_t & hQwords ); virtual void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ); virtual void GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ); virtual bool GotHitless () { return false; } virtual int GetDocsCount () { return m_iUniqDocs; } virtual uint64_t GetWordID () const { return m_uWordID; } void FillCacheIDF (); void PopulateCache ( const ISphQwordSetup & tSetup, bool bFillStat ); }; ExtCached_c::ExtCached_c ( const XQNode_t * pNode, const ISphQwordSetup & tSetup ) : m_dWords ( pNode->m_dWords.GetLength() ) { #ifndef NDEBUG // sanity checks // this node must be only created for a huge OR of tiny expansions assert ( pNode->GetOp()==SPH_QUERY_OR ); assert ( pNode->m_dWords.GetLength() ); assert ( tSetup.m_eDocinfo!=SPH_DOCINFO_INLINE ); ARRAY_FOREACH ( i, pNode->m_dWords ) { assert ( pNode->m_dWords[i].m_iAtomPos==pNode->m_dWords[0].m_iAtomPos ); assert ( pNode->m_dWords[i].m_bExpanded ); } #endif m_iAtomPos = pNode->m_dWords[0].m_iAtomPos; m_dFieldMask = pNode->m_dSpec.m_dFieldMask; m_bExcluded = pNode->m_bNotWeighted; m_dCache.Reserve ( pNode->m_dWords.GetLength() ); m_uWordID = 0; // copy all the keywords BYTE sTmpWord [ 3*SPH_MAX_WORD_LEN + 16 ]; ARRAY_FOREACH ( i, pNode->m_dWords ) { ExtCachedKeyword_t & tWord = m_dWords[i]; tWord.m_sWord = pNode->m_dWords[i].m_sWord; tWord.m_bFieldStart = pNode->m_dWords[i].m_bFieldStart; tWord.m_bFieldEnd = pNode->m_dWords[i].m_bFieldEnd; tWord.m_iDocs = 0; tWord.m_iHits = 0; // our little stemming buffer (morphology aware dictionary might need to change the keyword) strncpy ( (char*)sTmpWord, tWord.m_sWord.cstr(), sizeof(sTmpWord) ); sTmpWord[sizeof(sTmpWord)-1] = '\0'; // setup keyword disk reader tWord.m_iWordID = tSetup.m_pDict->GetWordID ( sTmpWord ); tWord.m_sDictWord = (const char *)sTmpWord; m_uWordID ^= tWord.m_iWordID; } PopulateCache ( tSetup, true ); } void ExtCached_c::PopulateCache ( const ISphQwordSetup & tSetup, bool bFillStat ) { // loop all the keywords and precache them ARRAY_FOREACH ( i, m_dWords ) { const ExtCachedKeyword_t & tWord = m_dWords[i]; ISphQword * pQword = tSetup.QwordSpawn ( tWord ); pQword->m_sWord = tWord.m_sWord; pQword->m_iWordID = tWord.m_iWordID; pQword->m_sDictWord = tWord.m_sDictWord; pQword->m_bExpanded = true; bool bOk = tSetup.QwordSetup ( pQword ); // setup keyword idf and stats if ( bFillStat ) { m_dWords[i].m_iDocs = pQword->m_iDocs; m_dWords[i].m_iHits = pQword->m_iHits; m_iUniqDocs += pQword->m_iDocs; } if ( !bOk ) { SafeDelete ( pQword ) continue; } // read and cache all docs and hits for ( ;; ) { const CSphMatch & tMatch = pQword->GetNextDoc ( NULL ); if ( !tMatch.m_iDocID ) break; pQword->SeekHitlist ( pQword->m_iHitlistPos ); for ( Hitpos_t uHit = pQword->GetNextHit(); uHit!=EMPTY_HIT; uHit = pQword->GetNextHit() ) { // apply field limits if ( !m_dFieldMask.Test ( HITMAN::GetField(uHit) ) ) continue; // FIXME!!! apply zone limits too // apply field-start/field-end modifiers if ( tWord.m_bFieldStart && HITMAN::GetPos(uHit)!=1 ) continue; if ( tWord.m_bFieldEnd && !HITMAN::IsEnd(uHit) ) continue; // ok, this hit works, copy it ExtCacheEntry_t & tEntry = m_dCache.Add (); tEntry.m_uDocid = tMatch.m_iDocID; tEntry.m_uHitpos = uHit; tEntry.m_iQword = i; } } // dismissed SafeDelete ( pQword ); } // sort from keyword order to document order m_dCache.Sort(); if ( bFillStat && m_dCache.GetLength() ) { m_iUniqDocs = 1; for ( int i=1; i=m_dCache.GetLength() ) return NULL; int iDoc = 0; int iEnd = m_iCurDocsEnd; // shortcut, and vs2005 optimization while ( iDoc=m_iCurDocsEnd ) return NULL; int iHit = 0; while ( pDocs->m_uDocid!=DOCID_MAX ) { // skip rejected documents while ( m_iCurHit < m_iCurDocsEnd && m_dCache[m_iCurHit].m_uDocid < pDocs->m_uDocid ) m_iCurHit++; if ( m_iCurHit>=m_iCurDocsEnd ) break; // skip non-matching documents SphDocID_t uDocid = m_dCache[m_iCurHit].m_uDocid; if ( pDocs->m_uDocid < uDocid ) { while ( pDocs->m_uDocid < uDocid ) pDocs++; if ( pDocs->m_uDocid!=uDocid ) continue; } // copy accepted documents while ( m_iCurHit < m_iCurDocsEnd && m_dCache[m_iCurHit].m_uDocid==pDocs->m_uDocid && iHit < MAX_HITS-1 ) { ExtHit_t & tHit = m_dHits[iHit++]; tHit.m_uDocid = m_dCache[m_iCurHit].m_uDocid; tHit.m_uHitpos = m_dCache[m_iCurHit].m_uHitpos; tHit.m_uQuerypos = (WORD) m_iAtomPos; tHit.m_uWeight = tHit.m_uMatchlen = tHit.m_uSpanlen = 1; m_iCurHit++; } if ( m_iCurHit>=m_iCurDocsEnd || iHit>=MAX_HITS-1 ) break; } assert ( iHit>=0 && iHitm_fIDF; } FillCacheIDF(); } void ExtCached_c::GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ) { if ( m_bExcluded ) return; ARRAY_FOREACH ( i, m_dWords ) { const ExtQword_t & tQword = hQwords[m_dWords[i].m_sWord]; dTermDupes[m_iAtomPos] = (WORD)tQword.m_iQueryPos; } } void ExtCached_c::FillCacheIDF () { // compute tf*idf for every document // store it into the opening hit CSphVector dWords; for ( int i=0; iIsEmpty() ) return NULL; if ( pNode->m_dWords.GetLength() || pNode->m_bVirtuallyPlain ) { const int iWords = pNode->m_bVirtuallyPlain ? pNode->m_dChildren.GetLength() : pNode->m_dWords.GetLength(); if ( iWords==1 ) { if ( pNode->m_bVirtuallyPlain ) return Create ( pNode->m_dChildren[0], tSetup ); else return Create ( pNode->m_dWords[0], pNode, tSetup ); } switch ( pNode->GetOp() ) { case SPH_QUERY_PHRASE: return CreateMultiNode ( pNode, tSetup, true ); case SPH_QUERY_PROXIMITY: return CreateMultiNode ( pNode, tSetup, true ); case SPH_QUERY_NEAR: return CreateMultiNode ( pNode, tSetup, true ); case SPH_QUERY_QUORUM: { assert ( pNode->m_dWords.GetLength()==0 || pNode->m_dChildren.GetLength()==0 ); int iQuorumCount = pNode->m_dWords.GetLength()+pNode->m_dChildren.GetLength(); if ( ExtQuorum_c::GetThreshold ( *pNode, iQuorumCount )>=iQuorumCount ) { // threshold is too high if ( tSetup.m_pWarning && !pNode->m_bPercentOp ) tSetup.m_pWarning->SetSprintf ( "quorum threshold too high (words=%d, thresh=%d); replacing quorum operator with AND operator", iQuorumCount, pNode->m_iOpArg ); } else if ( iQuorumCount>256 ) { // right now quorum can only handle 256 words if ( tSetup.m_pWarning ) tSetup.m_pWarning->SetSprintf ( "too many words (%d) for quorum; replacing with an AND", iQuorumCount ); } else // everything is ok; create quorum node { return CreateMultiNode ( pNode, tSetup, false ); } // couldn't create quorum, make an AND node instead CSphVector dTerms; dTerms.Reserve ( iQuorumCount ); ARRAY_FOREACH ( i, pNode->m_dWords ) dTerms.Add ( Create ( pNode->m_dWords[i], pNode, tSetup ) ); ARRAY_FOREACH ( i, pNode->m_dChildren ) dTerms.Add ( Create ( pNode->m_dChildren[i], tSetup ) ); // make not simple, but optimized AND node. dTerms.Sort ( ExtNodeTF_fn() ); ExtNode_i * pCur = dTerms[0]; for ( int i=1; iGetCount() ) return tSetup.m_pNodeCache->CreateProxy ( pCur, pNode, tSetup ); return pCur; } case SPH_QUERY_OR: // currently, only intended for tiny (super-rare) expanded keywords // might be a nice small optimization to remove the expanded restriction, though #ifndef NDEBUG assert ( pNode->m_dWords.GetLength() ); ARRAY_FOREACH ( i, pNode->m_dWords ) assert ( pNode->m_dWords[i].m_bExpanded ); #endif return new ExtCached_c ( pNode, tSetup ); default: assert ( 0 && "unexpected plain node type" ); return NULL; } } else { int iChildren = pNode->m_dChildren.GetLength (); assert ( iChildren>0 ); // special case, operator BEFORE if ( pNode->GetOp()==SPH_QUERY_BEFORE ) return CreateOrderNode ( pNode, tSetup ); // special case, AND over terms (internally reordered for speed) bool bAndTerms = ( pNode->GetOp()==SPH_QUERY_AND ); bool bZonespan = true; bool bZonespanChecked = false; for ( int i=0; im_dChildren[i]; bAndTerms = ( pChildren->m_dWords.GetLength()==1 ); bZonespan &= pChildren->m_dSpec.m_bZoneSpan; if ( !bZonespan ) break; bZonespanChecked = true; } bZonespan &= bZonespanChecked; if ( bAndTerms ) { if ( bZonespan ) { CSphVector dTerms; for ( int i=0; im_dChildren[i]; ExtNode_i * pTerm = ExtNode_i::Create ( pChild, tSetup ); if ( pTerm ) dTerms.Add ( pTerm ); } dTerms.Sort ( ExtNodeTF_fn() ); ExtNode_i * pCur = dTerms[0]; for ( int i=1; im_dChildren[0] ); // For zonespan we have also Extra data which is not (yet?) covered by common-node optimization. // if ( pNode->GetCount() ) // return tSetup.m_pNodeCache->CreateProxy ( pCur, pNode, tSetup ); return pCur; } else { CSphVector dTerms; for ( int i=0; im_dChildren[i]; ExtNode_i * pTerm = ExtNode_i::Create ( pChild, tSetup ); if ( pTerm ) dTerms.Add ( pTerm ); } dTerms.Sort ( ExtNodeTF_fn() ); ExtNode_i * pCur = dTerms[0]; for ( int i=1; iGetCount() ) return tSetup.m_pNodeCache->CreateProxy ( pCur, pNode, tSetup ); return pCur; } } // Multinear and phrase could be also non-plain, so here is the second entry for it. if ( pNode->GetOp()==SPH_QUERY_NEAR ) return CreateMultiNode ( pNode, tSetup, true ); if ( pNode->GetOp()==SPH_QUERY_PHRASE ) return CreateMultiNode ( pNode, tSetup, true ); // generic create ExtNode_i * pCur = NULL; for ( int i=0; im_dChildren[i], tSetup ); if ( !pNext ) continue; if ( !pCur ) { pCur = pNext; continue; } switch ( pNode->GetOp() ) { case SPH_QUERY_OR: pCur = new ExtOr_c ( pCur, pNext, tSetup ); break; case SPH_QUERY_AND: pCur = new ExtAnd_c ( pCur, pNext, tSetup ); break; case SPH_QUERY_ANDNOT: pCur = new ExtAndNot_c ( pCur, pNext, tSetup ); break; case SPH_QUERY_SENTENCE: pCur = new ExtUnit_c ( pCur, pNext, pNode->m_dSpec.m_dFieldMask, tSetup, MAGIC_WORD_SENTENCE ); break; case SPH_QUERY_PARAGRAPH: pCur = new ExtUnit_c ( pCur, pNext, pNode->m_dSpec.m_dFieldMask, tSetup, MAGIC_WORD_PARAGRAPH ); break; default: assert ( 0 && "internal error: unhandled op in ExtNode_i::Create()" ); break; } } if ( pCur && pNode->GetCount() ) return tSetup.m_pNodeCache->CreateProxy ( pCur, pNode, tSetup ); return pCur; } } ////////////////////////////////////////////////////////////////////////// inline void ExtTerm_c::Init ( ISphQword * pQword, const CSphSmallBitvec & dFields, const ISphQwordSetup & tSetup, bool bNotWeighted ) { m_pQword = pQword; m_pWarning = tSetup.m_pWarning; m_bNotWeighted = bNotWeighted; m_iAtomPos = pQword->m_iAtomPos; m_pLastChecked = m_dDocs; m_uMatchChecked = 0; m_bTail = false; m_dQueriedFields = dFields; m_bHasWideFields = false; if ( tSetup.m_pIndex && tSetup.m_pIndex->GetMatchSchema().m_dFields.GetLength()>32 ) for ( int i=1; i<8; i++ ) if ( m_dQueriedFields.m_dFieldsMask[i] ) m_bHasWideFields = true; m_iMaxTimer = tSetup.m_iMaxTimer; m_pStats = tSetup.m_pStats; m_pNanoBudget = m_pStats ? m_pStats->m_pNanoBudget : NULL; AllocDocinfo ( tSetup ); } ExtTerm_c::ExtTerm_c ( ISphQword * pQword, const ISphQwordSetup & tSetup ) : m_pQword ( pQword ) , m_pWarning ( tSetup.m_pWarning ) , m_bNotWeighted ( true ) { m_iAtomPos = pQword->m_iAtomPos; m_pLastChecked = m_dDocs; m_uMatchChecked = 0; m_bTail = false; m_dQueriedFields.Set(); m_bHasWideFields = tSetup.m_pIndex && ( tSetup.m_pIndex->GetMatchSchema().m_dFields.GetLength()>32 ); m_iMaxTimer = tSetup.m_iMaxTimer; m_pStats = tSetup.m_pStats; m_pNanoBudget = m_pStats ? m_pStats->m_pNanoBudget : NULL; AllocDocinfo ( tSetup ); } ExtTerm_c::ExtTerm_c ( ISphQword * pQword, const CSphSmallBitvec & dFields, const ISphQwordSetup & tSetup, bool bNotWeighted ) { Init ( pQword, dFields, tSetup, bNotWeighted ); } void ExtTerm_c::Reset ( const ISphQwordSetup & tSetup ) { m_pLastChecked = m_dDocs; m_uMatchChecked = 0; m_bTail = false; m_iMaxTimer = tSetup.m_iMaxTimer; m_pQword->Reset (); tSetup.QwordSetup ( m_pQword ); } const ExtDoc_t * ExtTerm_c::GetDocsChunk ( SphDocID_t * pMaxID ) { m_pLastChecked = m_dDocs; m_bTail = false; if ( !m_pQword->m_iDocs ) return NULL; m_uMaxID = 0; // max_query_time if ( m_iMaxTimer>0 && sphMicroTimer()>=m_iMaxTimer ) { if ( m_pWarning ) *m_pWarning = "query time exceeded max_query_time"; return NULL; } // max_predicted_time if ( m_pNanoBudget && *m_pNanoBudget<0 ) { if ( m_pWarning ) *m_pWarning = "predicted query time exceeded max_predicted_time"; return NULL; } // interrupt by sitgerm if ( m_bInterruptNow ) { if ( m_pWarning ) *m_pWarning = "Server shutdown in progress"; return NULL; } int iDoc = 0; CSphRowitem * pDocinfo = m_pDocinfo; while ( iDocGetNextDoc ( pDocinfo ); if ( !tMatch.m_iDocID ) { m_pQword->m_iDocs = 0; break; } if ( !m_bHasWideFields ) { // fields 0-31 can be quickly checked right here, right now if (!( m_pQword->m_dQwordFields.m_dFieldsMask[0] & m_dQueriedFields.m_dFieldsMask[0] )) continue; } else { // fields 32+ need to be checked with CollectHitMask() and stuff m_pQword->CollectHitMask(); if (!( m_pQword->m_dQwordFields.Test ( m_dQueriedFields ) )) continue; } ExtDoc_t & tDoc = m_dDocs[iDoc++]; tDoc.m_uDocid = tMatch.m_iDocID; tDoc.m_pDocinfo = pDocinfo; tDoc.m_uHitlistOffset = m_pQword->m_iHitlistPos; tDoc.m_uDocFields = m_pQword->m_dQwordFields.GetMask32() & m_dQueriedFields.GetMask32(); // OPTIMIZE: only needed for phrase node tDoc.m_fTFIDF = float(m_pQword->m_uMatchHits) / float(m_pQword->m_uMatchHits+SPH_BM25_K1) * m_fIDF; pDocinfo += m_iStride; } if ( m_pStats ) m_pStats->m_iFetchedDocs += iDoc; if ( m_pNanoBudget ) *m_pNanoBudget -= g_iPredictorCostDoc*iDoc; return ReturnDocsChunk ( iDoc, pMaxID ); } const ExtHit_t * ExtTerm_c::GetHitsChunk ( const ExtDoc_t * pMatched, SphDocID_t uMaxID ) { if ( !pMatched ) return NULL; // aim to the right document ExtDoc_t * pDoc = m_pLastChecked; assert ( pDoc && pDoc>=m_dDocs && pDocm_uDocid==m_uMatchChecked ) return NULL; // early reject whole block if ( pMatched->m_uDocid > m_uMaxID ) return NULL; if ( m_uMaxID && m_dDocs[0].m_uDocid > uMaxID ) return NULL; // find match m_uMatchChecked = pMatched->m_uDocid; do { while ( pDoc->m_uDocid < pMatched->m_uDocid ) pDoc++; m_pLastChecked = pDoc; if ( pDoc->m_uDocid==DOCID_MAX ) return NULL; // matched docs block is over for me, gimme another one while ( pMatched->m_uDocid < pDoc->m_uDocid ) pMatched++; if ( pMatched->m_uDocid==DOCID_MAX ) return NULL; // matched doc block did not yet begin for me, gimme another one } while ( pDoc->m_uDocid!=pMatched->m_uDocid ); // setup hitlist reader m_pQword->SeekHitlist ( pDoc->m_uHitlistOffset ); } // hit emission int iHit = 0; m_bTail = false; while ( iHitGetNextHit(); if ( uHit==EMPTY_HIT ) { // no more hits; get next acceptable document pDoc++; m_pLastChecked = pDoc; do { while ( pDoc->m_uDocid < pMatched->m_uDocid ) pDoc++; m_pLastChecked = pDoc; if ( pDoc->m_uDocid==DOCID_MAX ) { pDoc = NULL; break; } // matched docs block is over for me, gimme another one while ( pMatched->m_uDocid < pDoc->m_uDocid ) pMatched++; if ( pMatched->m_uDocid==DOCID_MAX ) { pDoc = NULL; break; } // matched doc block did not yet begin for me, gimme another one } while ( pDoc->m_uDocid!=pMatched->m_uDocid ); if ( !pDoc ) break; assert ( pDoc->m_uDocid==pMatched->m_uDocid ); // setup hitlist reader m_pQword->SeekHitlist ( pDoc->m_uHitlistOffset ); continue; } if (!( m_dQueriedFields.Test ( HITMAN::GetField ( uHit ) ) )) continue; ExtHit_t & tHit = m_dHits[iHit++]; tHit.m_uDocid = pDoc->m_uDocid; tHit.m_uHitpos = uHit; tHit.m_uQuerypos = (WORD) m_iAtomPos; // assume less that 64K words per query tHit.m_uWeight = tHit.m_uMatchlen = tHit.m_uSpanlen = 1; } if ( iHit==MAX_HITS-1 ) m_bTail = true; if ( m_pStats ) m_pStats->m_iFetchedHits += iHit; if ( m_pNanoBudget ) *m_pNanoBudget -= g_iPredictorCostHit*iHit; assert ( iHit>=0 && iHitm_sWord ); if ( !m_bNotWeighted && pQword && !pQword->m_bExcluded ) pQword->m_iQueryPos = Min ( pQword->m_iQueryPos, m_pQword->m_iAtomPos ); if ( m_bNotWeighted || pQword ) return m_pQword->m_bExcluded ? -1 : m_pQword->m_iAtomPos; m_fIDF = -1.0f; ExtQword_t tInfo; tInfo.m_sWord = m_pQword->m_sWord; tInfo.m_sDictWord = m_pQword->m_sDictWord; tInfo.m_iDocs = m_pQword->m_iDocs; tInfo.m_iHits = m_pQword->m_iHits; tInfo.m_iQueryPos = m_pQword->m_iAtomPos; tInfo.m_fIDF = -1.0f; // suppress gcc 4.2.3 warning tInfo.m_bExpanded = m_pQword->m_bExpanded; tInfo.m_bExcluded = m_pQword->m_bExcluded; hQwords.Add ( tInfo, m_pQword->m_sWord ); return m_pQword->m_bExcluded ? -1 : m_pQword->m_iAtomPos; } void ExtTerm_c::SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { if ( m_fIDF<0.0f ) { assert ( hQwords ( m_pQword->m_sWord ) ); m_fIDF = hQwords ( m_pQword->m_sWord )->m_fIDF; } } void ExtTerm_c::GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ) { if ( m_bNotWeighted || m_pQword->m_bExcluded ) return; ExtQword_t & tQword = hQwords[ m_pQword->m_sWord ]; dTermDupes[m_pQword->m_iAtomPos] = (WORD)tQword.m_iQueryPos; } ////////////////////////////////////////////////////////////////////////// const ExtHit_t * ExtTermHitless_c::GetHitsChunk ( const ExtDoc_t * pMatched, SphDocID_t uMaxID ) { if ( !pMatched ) return NULL; // aim to the right document ExtDoc_t * pDoc = m_pLastChecked; assert ( pDoc && pDoc>=m_dDocs && pDocm_uDocid==m_uMatchChecked ) return NULL; // early reject whole block if ( pMatched->m_uDocid > m_uMaxID ) return NULL; if ( m_uMaxID && m_dDocs[0].m_uDocid > uMaxID ) return NULL; // find match m_uMatchChecked = pMatched->m_uDocid; do { while ( pDoc->m_uDocid < pMatched->m_uDocid ) pDoc++; m_pLastChecked = pDoc; if ( pDoc->m_uDocid==DOCID_MAX ) return NULL; // matched docs block is over for me, gimme another one while ( pMatched->m_uDocid < pDoc->m_uDocid ) pMatched++; if ( pMatched->m_uDocid==DOCID_MAX ) return NULL; // matched doc block did not yet begin for me, gimme another one } while ( pDoc->m_uDocid!=pMatched->m_uDocid ); m_uFieldPos = 0; } // hit emission int iHit = 0; m_bTail = false; for ( ;; ) { if ( ( m_uFieldPos<32 && ( pDoc->m_uDocFields & ( 1 << m_uFieldPos ) ) ) // not necessary && m_dQueriedFields.Test ( m_uFieldPos ) ) { // emit hit ExtHit_t & tHit = m_dHits[iHit++]; tHit.m_uDocid = pDoc->m_uDocid; tHit.m_uHitpos = HITMAN::Create ( m_uFieldPos, -1 ); tHit.m_uQuerypos = (WORD) m_iAtomPos; tHit.m_uWeight = tHit.m_uMatchlen = tHit.m_uSpanlen = 1; if ( iHit==MAX_HITS-1 ) break; } if ( m_uFieldPos < CSphSmallBitvec::iTOTALBITS-1 ) { m_uFieldPos++; continue; } // field mask is empty, get next document pDoc++; m_pLastChecked = pDoc; do { while ( pDoc->m_uDocid < pMatched->m_uDocid ) pDoc++; m_pLastChecked = pDoc; if ( pDoc->m_uDocid==DOCID_MAX ) { pDoc = NULL; break; } // matched docs block is over for me, gimme another one while ( pMatched->m_uDocid < pDoc->m_uDocid ) pMatched++; if ( pMatched->m_uDocid==DOCID_MAX ) { pDoc = NULL; break; } // matched doc block did not yet begin for me, gimme another one } while ( pDoc->m_uDocid!=pMatched->m_uDocid ); if ( !pDoc ) break; m_uFieldPos = 0; } if ( iHit==MAX_HITS-1 ) m_bTail = true; assert ( iHit>=0 && iHit ExtConditional::ExtConditional ( ISphQword * pQword, const XQNode_t * pNode, const ISphQwordSetup & tSetup ) : BufferedNode_c () , ExtBase () , t_Acceptor ( pQword, pNode, tSetup ) { ExtBase::AllocDocinfo ( tSetup ); } template < TermPosFilter_e T, class ExtBase > void ExtConditional::Reset ( const ISphQwordSetup & tSetup ) { BufferedNode_c::Reset(); ExtBase::Reset(tSetup); TermAcceptor_c::Reset(); } inline bool TermAcceptor_c::IsAcceptableHit ( const ExtHit_t * pHit ) const { return HITMAN::GetPos ( pHit->m_uHitpos )<=m_iMaxFieldPos; } template<> inline bool TermAcceptor_c::IsAcceptableHit ( const ExtHit_t * pHit ) const { return HITMAN::GetPos ( pHit->m_uHitpos )==1; } template<> inline bool TermAcceptor_c::IsAcceptableHit ( const ExtHit_t * pHit ) const { return HITMAN::IsEnd ( pHit->m_uHitpos ); } template<> inline bool TermAcceptor_c::IsAcceptableHit ( const ExtHit_t * pHit ) const { return HITMAN::GetPos ( pHit->m_uHitpos )==1 && HITMAN::IsEnd ( pHit->m_uHitpos ); } inline bool TermAcceptor_c::IsAcceptableHit ( const ExtHit_t * pHit ) const { assert ( m_pZoneChecker ); if ( m_uLastZonedId!=pHit->m_uDocid ) m_iCheckFrom = 0; m_uLastZonedId = pHit->m_uDocid; // only check zones that actually match this document for ( int i=m_iCheckFrom; iIsInZone ( m_dZones[i], pHit ); switch ( eState ) { case SPH_ZONE_FOUND: return true; case SPH_ZONE_NO_DOCUMENT: Swap ( m_dZones[i], m_dZones[m_iCheckFrom] ); m_iCheckFrom++; break; default: break; } } return false; } inline bool TermAcceptor_c::IsAcceptableHit ( const ExtHit_t * pHit ) const { assert ( m_pZoneChecker ); int * pZones = ( m_bFinal ? m_dFinalZones.GetZVec ( m_iMyHit ) : m_dMyZones.GetZVec ( m_iMyHit ) ); bool bRes = false; // only check zones that actually match this document ARRAY_FOREACH ( i, m_dZones ) bRes |= ( m_pZoneChecker->IsInZone ( m_dZones[i], pHit, pZones + i )==SPH_ZONE_FOUND ); return bRes; } template < TermPosFilter_e T, class ExtBase > const ExtDoc_t * ExtConditional::GetDocsChunk ( SphDocID_t * pMaxID ) { SphDocID_t uSkipID = m_uLastID; // fetch more docs if needed if ( !m_pRawDocs ) { m_pRawDocs = ExtBase::GetDocsChunk ( &m_uTermMaxID ); if ( !m_pRawDocs ) return NULL; m_pRawDoc = m_pRawDocs; m_pRawHit = NULL; uSkipID = 0; } // filter the hits, and build the documents list int iMyDoc = 0; int iMyHit = 0; TermAcceptor_c::SetMyHit(0); const ExtDoc_t * pDoc = m_pRawDoc; // just a shortcut const ExtHit_t * pHit = m_pRawHit; SphDocID_t uLastID = m_uLastID = 0; CSphRowitem * pDocinfo = ExtBase::m_pDocinfo; for ( ;; ) { // try to fetch more hits for current raw docs block if we're out if ( !pHit || pHit->m_uDocid==DOCID_MAX ) pHit = ExtBase::GetHitsChunk ( m_pRawDocs, m_uTermMaxID ); // did we touch all the hits we had? if so, we're fully done with // current raw docs block, and should start a new one if ( !pHit ) { m_pRawDocs = ExtBase::GetDocsChunk ( &m_uTermMaxID ); if ( !m_pRawDocs ) // no more incoming documents? bail break; pDoc = m_pRawDocs; pHit = NULL; continue; } // skip all tail hits hits from documents below or same ID as uSkipID // scan until next acceptable hit while ( pHit->m_uDocid < pDoc->m_uDocid || ( uSkipID && pHit->m_uDocid<=uSkipID ) ) // skip leftovers pHit++; while ( ( pHit->m_uDocid!=DOCID_MAX || ( uSkipID && pHit->m_uDocid<=uSkipID ) ) && !t_Acceptor::IsAcceptableHit ( pHit ) ) // skip unneeded hits pHit++; if ( pHit->m_uDocid==DOCID_MAX || ( uSkipID && pHit->m_uDocid<=uSkipID ) ) // check for eof continue; // find and emit new document while ( pDoc->m_uDocidm_uDocid ) pDoc++; // FIXME? unsafe in broken cases assert ( pDoc->m_uDocid==pHit->m_uDocid ); assert ( iMyDocm_uDocid ) CopyExtDoc ( m_dMyDocs[iMyDoc++], *pDoc, &pDocinfo, ExtBase::m_iStride ); uLastID = pDoc->m_uDocid; // current hit is surely acceptable. m_dMyHits[iMyHit++] = *(pHit++); TermAcceptor_c::SetMyHit ( iMyHit ); // copy acceptable hits for this document while ( iMyHitm_uDocid==uLastID ) { if ( t_Acceptor::IsAcceptableHit ( pHit ) ) { m_dMyHits[iMyHit++] = *pHit; TermAcceptor_c::SetMyHit ( iMyHit ); } pHit++; } if ( iMyHit==ExtBase::MAX_HITS-1 ) { // there is no more space for acceptable hits; but further calls to GetHits() *might* produce some // we need to memorize the trailing document id m_uLastID = uLastID; break; } } m_pRawDoc = pDoc; m_pRawHit = pHit; assert ( iMyDoc>=0 && iMyDoc=0 && iMyHit const ExtHit_t * ExtConditional::GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ) { if ( m_eState==COPY_DONE ) { // this request completed in full if ( m_uDoneFor==pDocs->m_uDocid || !m_uDoneFor ) return NULL; // old request completed in full, but we have a new hits subchunk request now // even though there were no new docs requests in the meantime! m_eState = COPY_FILTERED; } m_uDoneFor = pDocs->m_uDocid; // regular case // copy hits for requested docs from my hits to filtered hits, and return those int iFilteredHits = 0; TermAcceptor_c::SetMyHit ( 0, true ); if ( m_eState==COPY_FILTERED ) { const ExtHit_t * pMyHits = m_dMyHits; const ExtHit_t * pMyHit = pMyHits; for ( ;; ) { // skip hits that the caller is not interested in while ( pMyHit->m_uDocid < pDocs->m_uDocid ) pMyHit++; // out of acceptable hits? if ( pMyHit->m_uDocid==DOCID_MAX ) { // do we have a trailing document? if yes, we should also copy trailing hits m_eState = m_uLastID ? COPY_TRAILING : COPY_DONE; break; } // skip docs that i do not have while ( pDocs->m_uDocid < pMyHit->m_uDocid ) pDocs++; // out of requested docs? over and out if ( pDocs->m_uDocid==DOCID_MAX ) { m_eState = COPY_DONE; break; } // copy matching hits while ( iFilteredHitsm_uDocid==pMyHit->m_uDocid ) { TermAcceptor_c::CopyMyHit ( pMyHit - pMyHits, iFilteredHits ); m_dFilteredHits[iFilteredHits++] = *pMyHit++; } // paranoid check that we're not out of bounds assert ( iFilteredHits<=ExtBase::MAX_HITS-1 && pDocs->m_uDocid!=pMyHit->m_uDocid ); } } // trailing hits case // my hits did not have enough space, so we should pass raw hits for the last doc while ( m_eState==COPY_TRAILING && m_uLastID && iFilteredHitsm_uDocid==DOCID_MAX ) m_pRawHit = ExtBase::GetHitsChunk ( m_pRawDocs, Min ( uMaxID, m_uTermMaxID ) ); // no more hits for current chunk if ( !m_pRawHit ) { m_eState = COPY_DONE; break; } // copy while we can TermAcceptor_c::SetMyHit ( iFilteredHits, true ); while ( m_pRawHit->m_uDocid==m_uLastID && iFilteredHits::SetMyHit ( iFilteredHits, true ); } m_pRawHit++; } // raise the flag for future calls if trailing hits are over if ( m_pRawHit->m_uDocid!=m_uLastID && m_pRawHit->m_uDocid!=DOCID_MAX ) m_eState = COPY_DONE; // in any case, this chunk is over break; } m_dFilteredHits[iFilteredHits].m_uDocid = DOCID_MAX; return iFilteredHits ? m_dFilteredHits : NULL; } ////////////////////////////////////////////////////////////////////////// ExtTwofer_c::ExtTwofer_c ( ExtNode_i * pFirst, ExtNode_i * pSecond, const ISphQwordSetup & tSetup ) { Init ( pFirst, pSecond, tSetup ); } inline void ExtTwofer_c::Init ( ExtNode_i * pFirst, ExtNode_i * pSecond, const ISphQwordSetup & tSetup ) { m_pChildren[0] = pFirst; m_pChildren[1] = pSecond; m_pCurHit[0] = NULL; m_pCurHit[1] = NULL; m_pCurDoc[0] = NULL; m_pCurDoc[1] = NULL; m_dNodePos[0] = 0; m_dNodePos[1] = 0; m_bPosAware = false; m_uMatchedDocid = 0; m_iAtomPos = ( pFirst && pFirst->m_iAtomPos ) ? pFirst->m_iAtomPos : 0; if ( pSecond && pSecond->m_iAtomPos && pSecond->m_iAtomPosm_iAtomPos; AllocDocinfo ( tSetup ); } ExtTwofer_c::~ExtTwofer_c () { SafeDelete ( m_pChildren[0] ); SafeDelete ( m_pChildren[1] ); } void ExtTwofer_c::Reset ( const ISphQwordSetup & tSetup ) { m_pChildren[0]->Reset ( tSetup ); m_pChildren[1]->Reset ( tSetup ); m_pCurHit[0] = NULL; m_pCurHit[1] = NULL; m_pCurDoc[0] = NULL; m_pCurDoc[1] = NULL; m_uMatchedDocid = 0; } int ExtTwofer_c::GetQwords ( ExtQwordsHash_t & hQwords ) { int iMax1 = m_pChildren[0]->GetQwords ( hQwords ); int iMax2 = m_pChildren[1]->GetQwords ( hQwords ); return Max ( iMax1, iMax2 ); } void ExtTwofer_c::SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { m_pChildren[0]->SetQwordsIDF ( hQwords ); m_pChildren[1]->SetQwordsIDF ( hQwords ); } void ExtTwofer_c::GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ) { m_pChildren[0]->GetTermDupes ( hQwords, dTermDupes ); m_pChildren[1]->GetTermDupes ( hQwords, dTermDupes ); } ////////////////////////////////////////////////////////////////////////// const ExtDoc_t * ExtAnd_c::GetDocsChunk ( SphDocID_t * pMaxID ) { m_uMaxID = 0; const ExtDoc_t * pCur0 = m_pCurDoc[0]; const ExtDoc_t * pCur1 = m_pCurDoc[1]; int iDoc = 0; CSphRowitem * pDocinfo = m_pDocinfo; for ( ;; ) { // if any of the pointers is empty, *and* there is no data yet, process next child chunk // if there is data, we can't advance, because child hitlist offsets would be lost if ( !pCur0 || !pCur1 ) { if ( iDoc!=0 ) break; if ( !pCur0 ) { if ( pCur1 && pCur1->m_uDocid!=DOCID_MAX ) m_pChildren[0]->HintDocid ( pCur1->m_uDocid ); pCur0 = m_pChildren[0]->GetDocsChunk ( NULL ); } if ( !pCur1 ) { if ( pCur0 && pCur0->m_uDocid!=DOCID_MAX ) m_pChildren[1]->HintDocid ( pCur0->m_uDocid ); pCur1 = m_pChildren[1]->GetDocsChunk ( NULL ); } if ( !pCur0 || !pCur1 ) { m_pCurDoc[0] = NULL; m_pCurDoc[1] = NULL; return NULL; } } // find common matches assert ( pCur0 && pCur1 ); while ( iDocm_uDocid < pCur1->m_uDocid ) pCur0++; if ( pCur0->m_uDocid==DOCID_MAX ) { pCur0 = NULL; break; } while ( pCur1->m_uDocid < pCur0->m_uDocid ) pCur1++; if ( pCur1->m_uDocid==DOCID_MAX ) { pCur1 = NULL; break; } if ( pCur0->m_uDocid!=pCur1->m_uDocid ) continue; // emit it ExtDoc_t & tDoc = m_dDocs[iDoc++]; tDoc.m_uDocid = pCur0->m_uDocid; tDoc.m_uDocFields = pCur0->m_uDocFields | pCur1->m_uDocFields; // not necessary tDoc.m_uHitlistOffset = -1; tDoc.m_fTFIDF = pCur0->m_fTFIDF + pCur1->m_fTFIDF; CopyExtDocinfo ( tDoc, *pCur0, &pDocinfo, m_iStride ); // skip it pCur0++; if ( pCur0->m_uDocid==DOCID_MAX ) pCur0 = NULL; pCur1++; if ( pCur1->m_uDocid==DOCID_MAX ) pCur1 = NULL; if ( !pCur0 || !pCur1 ) break; } } m_pCurDoc[0] = pCur0; m_pCurDoc[1] = pCur1; return ReturnDocsChunk ( iDoc, pMaxID ); } const ExtHit_t * ExtAnd_c::GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ) { const ExtHit_t * pCur0 = m_pCurHit[0]; const ExtHit_t * pCur1 = m_pCurHit[1]; if ( m_uMatchedDocid < pDocs->m_uDocid ) m_uMatchedDocid = 0; int iHit = 0; WORD uNodePos0 = m_dNodePos[0]; WORD uNodePos1 = m_dNodePos[1]; while ( iHitm_uDocid==m_uMatchedDocid ) || ( pCur1 && pCur1->m_uDocid==m_uMatchedDocid ) ) ) { // merge, while possible if ( pCur0 && pCur1 && pCur0->m_uDocid==m_uMatchedDocid && pCur1->m_uDocid==m_uMatchedDocid ) while ( iHitm_uHitpos < pCur1->m_uHitpos ) || ( pCur0->m_uHitpos==pCur1->m_uHitpos && pCur0->m_uQuerypos>pCur1->m_uQuerypos ) ) { m_dHits[iHit] = *pCur0++; if ( uNodePos0!=0 ) m_dHits[iHit++].m_uNodepos = uNodePos0; else iHit++; if ( pCur0->m_uDocid!=m_uMatchedDocid ) break; } else { m_dHits[iHit] = *pCur1++; if ( uNodePos1!=0 ) m_dHits[iHit++].m_uNodepos = uNodePos1; else iHit++; if ( pCur1->m_uDocid!=m_uMatchedDocid ) break; } } // copy tail, while possible, unless the other child is at the end of a hit block if ( pCur0 && pCur0->m_uDocid==m_uMatchedDocid && !( pCur1 && pCur1->m_uDocid==DOCID_MAX ) ) { while ( pCur0->m_uDocid==m_uMatchedDocid && iHitm_uDocid==m_uMatchedDocid && !( pCur0 && pCur0->m_uDocid==DOCID_MAX ) ) { while ( pCur1->m_uDocid==m_uMatchedDocid && iHitm_uDocid!=m_uMatchedDocid && pCur0->m_uDocid!=DOCID_MAX ) && ( pCur1 && pCur1->m_uDocid!=m_uMatchedDocid && pCur1->m_uDocid!=DOCID_MAX ) ) m_uMatchedDocid = 0; // warmup if needed if ( !pCur0 || pCur0->m_uDocid==DOCID_MAX ) pCur0 = m_pChildren[0]->GetHitsChunk ( pDocs, uMaxID ); if ( !pCur1 || pCur1->m_uDocid==DOCID_MAX ) pCur1 = m_pChildren[1]->GetHitsChunk ( pDocs, uMaxID ); // one of the hitlists is over if ( !pCur0 || !pCur1 ) { // if one is over, we might still need to copy the other one. otherwise, skip it if ( ( pCur0 && pCur0->m_uDocid==m_uMatchedDocid ) || ( pCur1 && pCur1->m_uDocid==m_uMatchedDocid ) ) continue; if ( pCur0 ) while ( ( pCur0 = m_pChildren[0]->GetHitsChunk ( pDocs, uMaxID ) )!=NULL ); if ( pCur1 ) while ( ( pCur1 = m_pChildren[1]->GetHitsChunk ( pDocs, uMaxID ) )!=NULL ); if ( !pCur0 && !pCur1 ) break; // both are over, we're done } // find matching doc assert ( pCur1 && pCur0 ); while ( !m_uMatchedDocid ) { while ( pCur0->m_uDocid < pCur1->m_uDocid ) pCur0++; if ( pCur0->m_uDocid==DOCID_MAX ) break; while ( pCur1->m_uDocid < pCur0->m_uDocid ) pCur1++; if ( pCur1->m_uDocid==DOCID_MAX ) break; if ( pCur0->m_uDocid==pCur1->m_uDocid ) m_uMatchedDocid = pCur0->m_uDocid; } } m_pCurHit[0] = pCur0; m_pCurHit[1] = pCur1; assert ( iHit>=0 && iHitGetZVec(iLeft); int * pRight = m_dChildzones[1]->GetZVec(iRight); for ( int i = 0; im_iNumZones; ++i ) if ( pLeft[i]>=0 && pLeft[i]==pRight[i] ) return true; return false; } ////////////////////////////////////////////////////////////////////////// const ExtHit_t * ExtAndZonespanned::GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ) { const ExtHit_t * pCur0 = m_pCurHit[0]; const ExtHit_t * pCur1 = m_pCurHit[1]; if ( m_uMatchedDocid < pDocs->m_uDocid ) m_uMatchedDocid = 0; int iHit = 0; WORD uNodePos0 = m_dNodePos[0]; WORD uNodePos1 = m_dNodePos[1]; while ( iHitm_uDocid==m_uMatchedDocid ) || ( pCur1 && pCur1->m_uDocid==m_uMatchedDocid ) ) ) { // merge, while possible if ( pCur0 && pCur1 && pCur0->m_uDocid==m_uMatchedDocid && pCur1->m_uDocid==m_uMatchedDocid ) while ( iHitm_uHitpos < pCur1->m_uHitpos ) || ( pCur0->m_uHitpos==pCur1->m_uHitpos && pCur0->m_uQuerypos>pCur1->m_uQuerypos ) ) { if ( IsSameZonespan ( pCur0-m_pLastBaseHit[0], pCur1-m_pLastBaseHit[1] ) ) { m_dHits[iHit] = *pCur0; if ( uNodePos0!=0 ) m_dHits[iHit].m_uNodepos = uNodePos0; m_dChildzones[0]->CopyZVecTo ( pCur0-m_pLastBaseHit[0], m_pSpans->GetZVec ( iHit++ ) ); } pCur0++; if ( pCur0->m_uDocid!=m_uMatchedDocid ) break; } else { if ( IsSameZonespan ( pCur0-m_pLastBaseHit[0], pCur1-m_pLastBaseHit[1] ) ) { m_dHits[iHit] = *pCur1; if ( uNodePos1!=0 ) m_dHits[iHit].m_uNodepos = uNodePos1; m_dChildzones[1]->CopyZVecTo ( pCur1-m_pLastBaseHit[1], m_pSpans->GetZVec ( iHit++ ) ); } pCur1++; if ( pCur1->m_uDocid!=m_uMatchedDocid ) break; } } // our special GetDocsChunk made the things so simply, that we doesn't need to care about tail hits at all. // copy tail, while possible, unless the other child is at the end of a hit block if ( pCur0 && pCur0->m_uDocid==m_uMatchedDocid && !( pCur1 && pCur1->m_uDocid==DOCID_MAX ) ) { while ( pCur0->m_uDocid==m_uMatchedDocid && iHitm_uDocid==m_uMatchedDocid && !( pCur0 && pCur0->m_uDocid==DOCID_MAX ) ) { while ( pCur1->m_uDocid==m_uMatchedDocid && iHitm_uDocid!=m_uMatchedDocid && pCur0->m_uDocid!=DOCID_MAX ) && ( pCur1 && pCur1->m_uDocid!=m_uMatchedDocid && pCur1->m_uDocid!=DOCID_MAX ) ) m_uMatchedDocid = 0; // warmup if needed if ( !pCur0 || pCur0->m_uDocid==DOCID_MAX ) { pCur0 = m_pChildren[0]->GetHitsChunk ( pDocs, uMaxID ); m_pLastBaseHit[0] = pCur0; } if ( !pCur1 || pCur1->m_uDocid==DOCID_MAX ) { pCur1 = m_pChildren[1]->GetHitsChunk ( pDocs, uMaxID ); m_pLastBaseHit[1] = pCur1; } // one of the hitlists is over if ( !pCur0 || !pCur1 ) { if ( !pCur0 && !pCur1 ) break; // both are over, we're done // one is over, but we still need to copy the other one m_uMatchedDocid = pCur0 ? pCur0->m_uDocid : pCur1->m_uDocid; assert ( m_uMatchedDocid!=DOCID_MAX ); continue; } // find matching doc assert ( pCur1 && pCur0 ); while ( !m_uMatchedDocid ) { while ( pCur0->m_uDocid < pCur1->m_uDocid ) pCur0++; if ( pCur0->m_uDocid==DOCID_MAX ) break; while ( pCur1->m_uDocid < pCur0->m_uDocid ) pCur1++; if ( pCur1->m_uDocid==DOCID_MAX ) break; if ( pCur0->m_uDocid==pCur1->m_uDocid ) m_uMatchedDocid = pCur0->m_uDocid; } } m_pCurHit[0] = pCur0; m_pCurHit[1] = pCur1; assert ( iHit>=0 && iHitm_uSpanlen + pRight->m_uHitpos - pLeft->m_uHitpos; } ////////////////////////////////////////////////////////////////////////// const ExtDoc_t * ExtOr_c::GetDocsChunk ( SphDocID_t * pMaxID ) { m_uMaxID = 0; const ExtDoc_t * pCur0 = m_pCurDoc[0]; const ExtDoc_t * pCur1 = m_pCurDoc[1]; DWORD uTouched = 0; int iDoc = 0; CSphRowitem * pDocinfo = m_pDocinfo; while ( iDocm_uDocid==DOCID_MAX ) { if ( uTouched & 1 ) break; // it was touched, so we can't advance, because child hitlist offsets would be lost pCur0 = m_pChildren[0]->GetDocsChunk ( NULL ); } if ( !pCur1 || pCur1->m_uDocid==DOCID_MAX ) { if ( uTouched & 2 ) break; // it was touched, so we can't advance, because child hitlist offsets would be lost pCur1 = m_pChildren[1]->GetDocsChunk ( NULL ); } // check if we're over if ( !pCur0 && !pCur1 ) break; // merge lists while we can, copy tail while if we can not if ( pCur0 && pCur1 ) { // merge lists if we have both of them while ( iDocm_uDocid < pCur1->m_uDocid && iDocm_uDocid==DOCID_MAX ) { pCur0 = NULL; break; } // copy min docids from 2nd child while ( pCur1->m_uDocid < pCur0->m_uDocid && iDocm_uDocid==DOCID_MAX ) { pCur1 = NULL; break; } // copy min docids from both children assert ( pCur0->m_uDocid && pCur0->m_uDocid!=DOCID_MAX ); assert ( pCur1->m_uDocid && pCur1->m_uDocid!=DOCID_MAX ); while ( pCur0->m_uDocid==pCur1->m_uDocid && pCur0->m_uDocid!=DOCID_MAX && iDocm_uDocFields | pCur1->m_uDocFields; // not necessary m_dDocs[iDoc].m_fTFIDF = pCur0->m_fTFIDF + pCur1->m_fTFIDF; CopyExtDocinfo ( m_dDocs[iDoc], *pCur0, &pDocinfo, m_iStride ); iDoc++; pCur0++; pCur1++; uTouched |= 3; } if ( pCur0->m_uDocid==DOCID_MAX ) { pCur0 = NULL; break; } if ( pCur1->m_uDocid==DOCID_MAX ) { pCur1 = NULL; break; } } } else { // copy tail if we don't have both lists const ExtDoc_t * pList = pCur0 ? pCur0 : pCur1; if ( pList->m_uDocid!=DOCID_MAX && iDocm_uDocid!=DOCID_MAX && iDocm_uDocid==DOCID_MAX ) pList = NULL; if ( pCur0 ) pCur0 = pList; else pCur1 = pList; } } m_pCurDoc[0] = pCur0; m_pCurDoc[1] = pCur1; return ReturnDocsChunk ( iDoc, pMaxID ); } const ExtHit_t * ExtOr_c::GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ) { const ExtHit_t * pCur0 = m_pCurHit[0]; const ExtHit_t * pCur1 = m_pCurHit[1]; int iHit = 0; while ( iHitm_uDocid==m_uMatchedDocid ) || ( pCur1 && pCur1->m_uDocid==m_uMatchedDocid ) ) ) { // merge, while possible if ( pCur0 && pCur1 && pCur0->m_uDocid==m_uMatchedDocid && pCur1->m_uDocid==m_uMatchedDocid ) while ( iHitm_uHitpos < pCur1->m_uHitpos ) { m_dHits[iHit++] = *pCur0++; if ( pCur0->m_uDocid!=m_uMatchedDocid ) break; } else { m_dHits[iHit++] = *pCur1++; if ( pCur1->m_uDocid!=m_uMatchedDocid ) break; } } // a pretty tricky bit // one of the nodes might have run out of current hits chunk (rather hits at all) // so we need to get the next hits chunk NOW, check for that condition, and keep merging // simply going to tail hits copying is incorrect, it could copy in wrong order // example, word A, pos 1, 2, 3, hit chunk ends, 4, 5, 6, word B, pos 7, 8, 9 if ( !pCur0 || pCur0->m_uDocid==DOCID_MAX ) { pCur0 = m_pChildren[0]->GetHitsChunk ( pDocs, uMaxID ); if ( pCur0 && pCur0->m_uDocid==m_uMatchedDocid ) continue; } if ( !pCur1 || pCur1->m_uDocid==DOCID_MAX ) { pCur1 = m_pChildren[1]->GetHitsChunk ( pDocs, uMaxID ); if ( pCur1 && pCur1->m_uDocid==m_uMatchedDocid ) continue; } // copy tail, while possible if ( pCur0 && pCur0->m_uDocid==m_uMatchedDocid ) { while ( pCur0->m_uDocid==m_uMatchedDocid && iHitm_uDocid==m_uMatchedDocid ); while ( pCur1->m_uDocid==m_uMatchedDocid && iHitm_uDocid!=m_uMatchedDocid ) && ( pCur1 && pCur1->m_uDocid!=m_uMatchedDocid ) ) m_uMatchedDocid = 0; // warmup if needed if ( !pCur0 || pCur0->m_uDocid==DOCID_MAX ) pCur0 = m_pChildren[0]->GetHitsChunk ( pDocs, uMaxID ); if ( !pCur1 || pCur1->m_uDocid==DOCID_MAX ) pCur1 = m_pChildren[1]->GetHitsChunk ( pDocs, uMaxID ); if ( !pCur0 && !pCur1 ) break; m_uMatchedDocid = ( pCur0 && pCur1 ) ? Min ( pCur0->m_uDocid, pCur1->m_uDocid ) : ( pCur0 ? pCur0->m_uDocid : pCur1->m_uDocid ); } m_pCurHit[0] = pCur0; m_pCurHit[1] = pCur1; assert ( iHit>=0 && iHitGetDocsChunk ( pMaxID ); // otherwise, do some removals m_uMaxID = 0; const ExtDoc_t * pCur0 = m_pCurDoc[0]; const ExtDoc_t * pCur1 = m_pCurDoc[1]; int iDoc = 0; CSphRowitem * pDocinfo = m_pDocinfo; while ( iDocm_uDocid==DOCID_MAX ) { // there were matches; we can not pull more because that'd fuckup hitlists if ( iDoc ) break; // no matches so far; go pull pCur0 = m_pChildren[0]->GetDocsChunk ( NULL ); if ( !pCur0 ) break; } // pull more docs from reject, if nedeed if ( !pCur1 || pCur1->m_uDocid==DOCID_MAX ) pCur1 = m_pChildren[1]->GetDocsChunk ( NULL ); // if there's nothing to filter against, simply copy leftovers if ( !pCur1 ) { assert ( pCur0 ); while ( pCur0->m_uDocid!=DOCID_MAX && iDocm_uDocid==DOCID_MAX ) m_bPassthrough = true; break; } // perform filtering assert ( pCur0 ); assert ( pCur1 ); for ( ;; ) { assert ( iDocm_uDocid!=DOCID_MAX ); assert ( pCur1->m_uDocid!=DOCID_MAX ); // copy accepted until min rejected id while ( pCur0->m_uDocid < pCur1->m_uDocid && iDocm_uDocid==DOCID_MAX || iDoc==MAX_DOCS-1 ) break; // skip rejected until min accepted id while ( pCur1->m_uDocid < pCur0->m_uDocid ) pCur1++; if ( pCur1->m_uDocid==DOCID_MAX ) break; // skip both while ids match while ( pCur0->m_uDocid==pCur1->m_uDocid && pCur0->m_uDocid!=DOCID_MAX ) { pCur0++; pCur1++; } if ( pCur0->m_uDocid==DOCID_MAX || pCur1->m_uDocid==DOCID_MAX ) break; } } m_pCurDoc[0] = pCur0; m_pCurDoc[1] = pCur1; return ReturnDocsChunk ( iDoc, pMaxID ); } const ExtHit_t * ExtAndNot_c::GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ) { return m_pChildren[0]->GetHitsChunk ( pDocs, uMaxID ); }; void ExtAndNot_c::Reset ( const ISphQwordSetup & tSetup ) { m_bPassthrough = false; ExtTwofer_c::Reset ( tSetup ); } ////////////////////////////////////////////////////////////////////////// ExtNWayT::ExtNWayT ( const CSphVector & dNodes, const ISphQwordSetup & tSetup ) : m_pNode ( NULL ) , m_pDocs ( NULL ) , m_pHits ( NULL ) , m_pDoc ( NULL ) , m_pHit ( NULL ) , m_pMyDoc ( NULL ) , m_pMyHit ( NULL ) , m_uLastDocID ( 0 ) , m_uMatchedDocid ( 0 ) , m_uHitsOverFor ( 0 ) { assert ( dNodes.GetLength()>1 ); m_iAtomPos = dNodes[0]->m_iAtomPos; m_dMyHits[0].m_uDocid = DOCID_MAX; AllocDocinfo ( tSetup ); } ExtNWayT::~ExtNWayT () { SafeDelete ( m_pNode ); } void ExtNWayT::Reset ( const ISphQwordSetup & tSetup ) { m_pNode->Reset ( tSetup ); m_pDocs = NULL; m_pHits = NULL; m_pDoc = NULL; m_pHit = NULL; m_pMyDoc = NULL; m_pMyHit = NULL; m_uLastDocID = 0; m_uMatchedDocid = 0; m_uHitsOverFor = 0; m_dMyHits[0].m_uDocid = DOCID_MAX; } int ExtNWayT::GetQwords ( ExtQwordsHash_t & hQwords ) { assert ( m_pNode ); return m_pNode->GetQwords ( hQwords ); } void ExtNWayT::SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { assert ( m_pNode ); m_pNode->SetQwordsIDF ( hQwords ); } void ExtNWayT::GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ) { assert ( m_pNode ); m_pNode->GetTermDupes ( hQwords, dTermDupes ); } uint64_t ExtNWayT::GetWordID() const { assert ( m_pNode ); return m_pNode->GetWordID(); } template < class FSM > const ExtDoc_t * ExtNWay_c::GetDocsChunk ( SphDocID_t * pMaxID ) { m_uMaxID = 0; // initial warmup if ( !m_pDoc ) { if ( !m_pDocs ) m_pDocs = m_pNode->GetDocsChunk ( &m_uDocsMaxID ); if ( !m_pDocs ) return NULL; // no more docs m_pDoc = m_pDocs; } // shortcuts const ExtDoc_t * pDoc = m_pDoc; const ExtHit_t * pHit = m_pHit; FSM::ResetFSM(); // skip leftover hits while ( m_uLastDocID ) { if ( !pHit || pHit->m_uDocid==DOCID_MAX ) { pHit = m_pHits = m_pNode->GetHitsChunk ( m_pDocs, m_uDocsMaxID ); if ( !pHit ) break; } while ( pHit->m_uDocid==m_uLastDocID ) pHit++; if ( pHit->m_uDocid!=DOCID_MAX && pHit->m_uDocid!=m_uLastDocID ) m_uLastDocID = 0; } // search for matches int iDoc = 0; int iHit = 0; CSphRowitem * pDocinfo = m_pDocinfo; while ( iHitm_uDocid==DOCID_MAX ) { // grab more hits pHit = m_pHits = m_pNode->GetHitsChunk ( m_pDocs, m_uDocsMaxID ); if ( m_pHits ) continue; m_uMatchedDocid = 0; // no more hits for current docs chunk; grab more docs pDoc = m_pDocs = m_pNode->GetDocsChunk ( &m_uDocsMaxID ); if ( !m_pDocs ) break; // we got docs, there must be hits pHit = m_pHits = m_pNode->GetHitsChunk ( m_pDocs, m_uDocsMaxID ); assert ( pHit ); continue; } // check if the incoming hit is out of bounds, or affects min pos if ( pHit->m_uDocid!=m_uMatchedDocid ) { m_uMatchedDocid = pHit->m_uDocid; FSM::ResetFSM(); continue; } if ( FSM::HitFSM ( pHit, &m_dMyHits[iHit] ) ) { // emit document, if it's new if ( pHit->m_uDocid!=m_uLastDocID ) { assert ( pDoc->m_uDocid<=pHit->m_uDocid ); while ( pDoc->m_uDocid < pHit->m_uDocid ) pDoc++; assert ( pDoc->m_uDocid==pHit->m_uDocid ); m_dDocs[iDoc].m_uDocid = pHit->m_uDocid; m_dDocs[iDoc].m_uDocFields = 1<< ( HITMAN::GetField ( pHit->m_uHitpos ) ); // non necessary m_dDocs[iDoc].m_uHitlistOffset = -1; m_dDocs[iDoc].m_fTFIDF = pDoc->m_fTFIDF; CopyExtDocinfo ( m_dDocs[iDoc], *pDoc, &pDocinfo, m_iStride ); iDoc++; m_uLastDocID = pHit->m_uDocid; } iHit++; } // go on pHit++; } // reset current positions for hits chunk getter m_pMyDoc = m_dDocs; m_pMyHit = m_dMyHits; // save shortcuts m_pDoc = pDoc; m_pHit = pHit; assert ( iHit>=0 && iHit const ExtHit_t * ExtNWay_c::GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ) { // if we already emitted hits for this matches block, do not do that again SphDocID_t uFirstMatch = pDocs->m_uDocid; if ( uFirstMatch==m_uHitsOverFor ) return NULL; // early reject whole block if ( pDocs->m_uDocid > m_uMaxID ) return NULL; if ( m_uMaxID && m_dDocs[0].m_uDocid > uMaxID ) return NULL; // shortcuts const ExtDoc_t * pMyDoc = m_pMyDoc; const ExtHit_t * pMyHit = m_pMyHit; assert ( pMyDoc ); assert ( pMyHit ); // filter and copy hits from m_dMyHits int iHit = 0; while ( iHitm_uDocid < pDocs->m_uDocid ) pMyDoc++; if ( pMyDoc->m_uDocid==DOCID_MAX ) break; while ( pDocs->m_uDocid < pMyDoc->m_uDocid ) pDocs++; if ( pDocs->m_uDocid==DOCID_MAX ) break; } while ( pDocs->m_uDocid!=pMyDoc->m_uDocid ); if ( pDocs->m_uDocid!=pMyDoc->m_uDocid ) { assert ( pMyDoc->m_uDocid==DOCID_MAX || pDocs->m_uDocid==DOCID_MAX ); break; } assert ( pDocs->m_uDocid==pMyDoc->m_uDocid ); assert ( pDocs->m_uDocid!=0 ); assert ( pDocs->m_uDocid!=DOCID_MAX ); m_uMatchedDocid = pDocs->m_uDocid; } // skip until we have to while ( pMyHit->m_uDocid < m_uMatchedDocid ) pMyHit++; // copy while we can if ( pMyHit->m_uDocid!=DOCID_MAX ) { assert ( pMyHit->m_uDocid==m_uMatchedDocid ); assert ( m_uMatchedDocid!=0 && m_uMatchedDocid!=DOCID_MAX ); while ( pMyHit->m_uDocid==m_uMatchedDocid && iHitm_uDocid!=m_uMatchedDocid && pMyHit->m_uDocid!=DOCID_MAX ) { // it's simply next document in the line; switch to it m_uMatchedDocid = 0; pMyDoc++; } else if ( pMyHit->m_uDocid==DOCID_MAX && !m_pHit ) { // it's the end break; } else if ( pMyHit->m_uDocid==DOCID_MAX && m_pHit && iHitm_uDocid ); assert ( m_uMatchedDocid==m_uLastDocID ); assert ( !m_pDoc || m_uMatchedDocid==m_pDoc->m_uDocid ); m_pMyDoc = pMyDoc; if ( EmitTail(iHit) ) m_uHitsOverFor = uFirstMatch; pMyDoc = m_pMyDoc; } } // save shortcuts m_pMyDoc = pMyDoc; m_pMyHit = pMyHit; assert ( iHit>=0 && iHit bool ExtNWay_c::EmitTail ( int & iHit ) { const ExtHit_t * pHit = m_pHit; const ExtDoc_t * pMyDoc = m_pMyDoc; bool bTailFinished = false; while ( iHitm_uDocid==DOCID_MAX ) { pHit = m_pHits = m_pNode->GetHitsChunk ( m_pDocs, m_uDocsMaxID ); if ( !pHit ) { m_uMatchedDocid = 0; pMyDoc++; break; } } // stop and finish on the first new id if ( pHit->m_uDocid!=m_uMatchedDocid ) { // reset hits getter; this docs chunk from above is finally over bTailFinished = true; m_uMatchedDocid = 0; pMyDoc++; break; } if ( FSM::HitFSM ( pHit, &m_dHits[iHit] ) ) iHit++; pHit++; } // save shortcut m_pHit = pHit; m_pMyDoc = pMyDoc; return bTailFinished; } ////////////////////////////////////////////////////////////////////////// FSMphrase::FSMphrase ( const CSphVector & dQwords, const XQNode_t & , const ISphQwordSetup & ) : m_dAtomPos ( dQwords.GetLength() ) { ARRAY_FOREACH ( i, dQwords ) m_dAtomPos[i] = dQwords[i]->m_iAtomPos; assert ( ( m_dAtomPos.Last()-m_dAtomPos[0]+1 )>0 ); m_dQposDelta.Resize ( m_dAtomPos.Last()-m_dAtomPos[0]+1 ); ARRAY_FOREACH ( i, m_dQposDelta ) m_dQposDelta[i] = -INT_MAX; for ( int i=1; i<(int)m_dAtomPos.GetLength(); i++ ) m_dQposDelta [ dQwords[i-1]->m_iAtomPos - dQwords[0]->m_iAtomPos ] = dQwords[i]->m_iAtomPos - dQwords[i-1]->m_iAtomPos; } inline bool FSMphrase::HitFSM ( const ExtHit_t* pHit, ExtHit_t* dTarget ) { int iHitpos = HITMAN::GetLCS ( pHit->m_uHitpos ); // adding start state for start hit if ( pHit->m_uQuerypos==m_dAtomPos[0] ) { State_t & tState = m_dStates.Add(); tState.m_iTagQword = 0; tState.m_iExpHitpos = iHitpos + m_dQposDelta[0]; } // updating states for ( int i=m_dStates.GetLength()-1; i>=0; i-- ) { if ( m_dStates[i].m_iExpHitposm_uQuerypos ) { m_dStates[i].m_iTagQword++; // check for next elm in query m_dStates[i].m_iExpHitpos = iHitpos + m_dQposDelta [ pHit->m_uQuerypos - m_dAtomPos[0] ]; } // checking if state successfully matched if ( m_dStates[i].m_iTagQword==m_dAtomPos.GetLength()-1 ) { DWORD uSpanlen = m_dAtomPos.Last() - m_dAtomPos[0]; // emit directly into m_dHits, this is no need to disturb m_dMyHits here. dTarget->m_uDocid = pHit->m_uDocid; dTarget->m_uHitpos = iHitpos - uSpanlen; dTarget->m_uQuerypos = (WORD) m_dAtomPos[0]; dTarget->m_uMatchlen = dTarget->m_uSpanlen = (WORD)( uSpanlen + 1 ); dTarget->m_uWeight = m_dAtomPos.GetLength(); ResetFSM (); return true; } } return false; } ////////////////////////////////////////////////////////////////////////// FSMproximity::FSMproximity ( const CSphVector & dQwords, const XQNode_t & tNode, const ISphQwordSetup & ) : m_iMaxDistance ( tNode.m_iOpArg ) , m_uWordsExpected ( dQwords.GetLength() ) , m_uExpPos ( 0 ) { assert ( m_iMaxDistance>0 ); m_uMinQpos = dQwords[0]->m_iAtomPos; m_uQLen = dQwords.Last()->m_iAtomPos - m_uMinQpos; m_dProx.Resize ( m_uQLen+1 ); m_dDeltas.Resize ( m_uQLen+1 ); } inline bool FSMproximity::HitFSM ( const ExtHit_t* pHit, ExtHit_t* dTarget ) { // walk through the hitlist and update context int iQindex = pHit->m_uQuerypos - m_uMinQpos; DWORD uHitpos = HITMAN::GetLCS ( pHit->m_uHitpos ); // check if the word is new if ( m_dProx[iQindex]==UINT_MAX ) m_uWords++; // update the context m_dProx[iQindex] = uHitpos; // check if the incoming hit is out of bounds, or affects min pos if ( uHitpos>=m_uExpPos // out of expected bounds || iQindex==m_iMinQindex ) // or simply affects min pos { m_iMinQindex = iQindex; int iMinPos = uHitpos - m_uQLen - m_iMaxDistance; ARRAY_FOREACH ( i, m_dProx ) if ( m_dProx[i]!=UINT_MAX ) { if ( (int)m_dProx[i]<=iMinPos ) { m_dProx[i] = UINT_MAX; m_uWords--; continue; } if ( m_dProx[i]m_uHitpos - m_dProx[m_iMinQindex] - m_uQLen ) DWORD uMax = 0; ARRAY_FOREACH ( i, m_dProx ) { m_dDeltas[i] = m_dProx[i] - i; uMax = Max ( uMax, m_dProx[i] ); } m_dDeltas.Sort (); DWORD uWeight = 0; int iLast = -INT_MAX; ARRAY_FOREACH ( i, m_dDeltas ) { if ( m_dDeltas[i]==iLast ) uWeight++; else uWeight = 1; iLast = m_dDeltas[i]; } // emit hit dTarget->m_uDocid = pHit->m_uDocid; dTarget->m_uHitpos = Hitpos_t ( m_dProx[m_iMinQindex] ); // !COMMIT strictly speaking this is creation from LCS not value dTarget->m_uQuerypos = (WORD) m_uMinQpos; dTarget->m_uSpanlen = dTarget->m_uMatchlen = (WORD)( uMax-m_dProx[m_iMinQindex]+1 ); dTarget->m_uWeight = uWeight; // remove current min, and force recompue m_dProx[m_iMinQindex] = UINT_MAX; m_iMinQindex = -1; m_uWords--; m_uExpPos = 0; return true; } ////////////////////////////////////////////////////////////////////////// FSMmultinear::FSMmultinear ( const CSphVector & dNodes, const XQNode_t & tNode, const ISphQwordSetup & ) : m_iNear ( tNode.m_iOpArg ) , m_uWordsExpected ( dNodes.GetLength() ) { if ( m_uWordsExpected==2 ) m_bTwofer = true; else { m_dNpos.Reserve ( m_uWordsExpected ); m_dRing.Resize ( m_uWordsExpected ); m_bTwofer = false; } assert ( m_iNear>0 ); } inline bool FSMmultinear::HitFSM ( const ExtHit_t* pHit, ExtHit_t* dTarget ) { // walk through the hitlist and update context DWORD uHitpos = HITMAN::GetLCS ( pHit->m_uHitpos ); WORD uNpos = pHit->m_uNodepos; WORD uQpos = pHit->m_uQuerypos; // skip dupe hit (may be emitted by OR node, for example) if ( m_uLastP==uHitpos ) { // lets choose leftmost (in query) from all dupes. 'a NEAR/2 a' case if ( m_bTwofer && uNpos( m_dNpos.BinarySearch ( uNpos ) ); if ( !p ) { p = const_cast( m_dNpos.BinarySearch ( m_dRing [ RingTail() ].m_uNodepos ) ); *p = uNpos; m_dNpos.Sort(); m_dRing [ RingTail() ].m_uNodepos = uNpos; m_dRing [ RingTail() ].m_uQuerypos = uQpos; } return false; } else if ( m_uPrelastP && m_uLastML < pHit->m_uMatchlen ) // check if the hit is subset of another one { // roll back pre-last to check agains this new hit. m_uLastML = m_uPrelastML; m_uLastSL = m_uPrelastSL; m_uFirstHit = m_uLastP = m_uPrelastP; m_uWeight = m_uWeight - m_uLastW + m_uPrelastW; } else return false; } // probably new chain if ( m_uLastP==0 || ( m_uLastP + m_uLastML + m_iNear )<=uHitpos ) { m_uFirstHit = m_uLastP = uHitpos; m_uLastML = pHit->m_uMatchlen; m_uLastSL = pHit->m_uSpanlen; m_uWeight = m_uLastW = pHit->m_uWeight; if ( m_bTwofer ) { m_uFirstQpos = uQpos; m_uFirstNpos = uNpos; } else { m_dNpos.Resize(1); m_dNpos[0] = uNpos; Add2Ring ( pHit ); } return false; } // this hit (with such querypos) already was there. Skip the hit. if ( m_bTwofer ) { // special case for twofer: hold the overlapping if ( ( m_uFirstHit + m_uLastML )>uHitpos && ( m_uFirstHit + m_uLastML )<( uHitpos + pHit->m_uMatchlen ) && m_uLastML!=pHit->m_uMatchlen ) { m_uFirstHit = m_uLastP = uHitpos; m_uLastML = pHit->m_uMatchlen; m_uLastSL = pHit->m_uSpanlen; m_uWeight = m_uLastW = pHit->m_uWeight; m_uFirstQpos = uQpos; m_uFirstNpos = uNpos; return false; } if ( uNpos==m_uFirstNpos ) { if ( m_uLastP < uHitpos ) { m_uPrelastML = m_uLastML; m_uPrelastSL = m_uLastSL; m_uPrelastP = m_uLastP; m_uPrelastW = pHit->m_uWeight; m_uFirstHit = m_uLastP = uHitpos; m_uLastML = pHit->m_uMatchlen; m_uLastSL = pHit->m_uSpanlen; m_uWeight = m_uLastW = m_uPrelastW; m_uFirstQpos = uQpos; m_uFirstNpos = uNpos; } return false; } } else { if ( uNpos < m_dNpos[0] ) { m_uFirstQpos = Min ( m_uFirstQpos, uQpos ); m_dNpos.Insert ( 0, uNpos ); } else if ( uNpos > m_dNpos.Last() ) { m_uFirstQpos = Min ( m_uFirstQpos, uQpos ); m_dNpos.Add ( uNpos ); } else if ( uNpos!=m_dNpos[0] && uNpos!=m_dNpos.Last() ) { int iEnd = m_dNpos.GetLength(); int iStart = 0; int iMid = -1; while ( iEnd-iStart>1 ) { iMid = ( iStart + iEnd ) / 2; if ( uNpos==m_dNpos[iMid] ) { const ExtHit_t& dHit = m_dRing[m_iRing]; // last addition same as the first. So, we can shift if ( uNpos==dHit.m_uNodepos ) { m_uWeight -= dHit.m_uWeight; m_uFirstHit = HITMAN::GetLCS ( dHit.m_uHitpos ); ShiftRing(); // last addition same as the first. So, we can shift } else if ( uNpos==m_dRing [ RingTail() ].m_uNodepos ) m_uWeight -= m_dRing [ RingTail() ].m_uWeight; else return false; } if ( uNposm_uWeight; m_uLastML = pHit->m_uMatchlen; m_uLastSL = pHit->m_uSpanlen; Add2Ring ( pHit ); // finally got the whole chain - emit it! // warning: we don't support overlapping in generic chains. if ( m_bTwofer || (int)m_uWordsExpected==m_dNpos.GetLength() ) { dTarget->m_uDocid = pHit->m_uDocid; dTarget->m_uHitpos = Hitpos_t ( m_uFirstHit ); // !COMMIT strictly speaking this is creation from LCS not value dTarget->m_uMatchlen = (WORD)( uHitpos - m_uFirstHit + m_uLastML ); dTarget->m_uWeight = m_uWeight; m_uPrelastP = 0; if ( m_bTwofer ) // for exactly 2 words allow overlapping - so, just shift the chain, not reset it { dTarget->m_uQuerypos = Min ( m_uFirstQpos, pHit->m_uQuerypos ); dTarget->m_uSpanlen = 2; m_uFirstHit = m_uLastP = uHitpos; m_uWeight = pHit->m_uWeight; m_uFirstQpos = pHit->m_uQuerypos; } else { dTarget->m_uQuerypos = Min ( m_uFirstQpos, pHit->m_uQuerypos ); dTarget->m_uSpanlen = (WORD) m_dNpos.GetLength(); m_uLastP = 0; } return true; } m_uLastP = uHitpos; return false; } ////////////////////////////////////////////////////////////////////////// struct QuorumDupeNodeHash_t { uint64_t m_uWordID; int m_iIndex; bool operator < ( const QuorumDupeNodeHash_t & b ) const { if ( m_uWordID==b.m_uWordID ) return m_iIndex & dQwords, const XQNode_t & tNode, const ISphQwordSetup & tSetup ) { assert ( tNode.GetOp()==SPH_QUERY_QUORUM ); assert ( dQwords.GetLength()1 ); // use TERM instead assert ( dQwords.GetLength()<=256 ); // internal masks are upto 256 bits assert ( m_iThresh>=1 ); // 1 is also OK; it's a bit different from just OR assert ( m_iThresh0 ) { m_iAtomPos = dQwords[0]->m_iAtomPos; // compute duplicate keywords mask (aka dupe mask) // FIXME! will (likely) now fail with quorum+expand+dupes, need a new test? // FIXME! will fail with wordforms and stuff; sorry, no wordforms vs expand vs quorum support for now! CSphFixedVector dHashes ( dQwords.GetLength() ); ARRAY_FOREACH ( i, dQwords ) { dHashes[i].m_uWordID = dQwords[i]->GetWordID(); dHashes[i].m_iIndex = i; } sphSort ( dHashes.Begin(), dHashes.GetLength() ); QuorumDupeNodeHash_t tParent = *dHashes.Begin(); m_dInitialChildren.Add().m_pTerm = dQwords[tParent.m_iIndex]; m_dInitialChildren.Last().m_iCount = 1; tParent.m_iIndex = 0; for ( int i=1; iReset ( tSetup ); } int ExtQuorum_c::GetQwords ( ExtQwordsHash_t & hQwords ) { int iMax = -1; ARRAY_FOREACH ( i, m_dChildren ) { int iKidMax = m_dChildren[i].m_pTerm->GetQwords ( hQwords ); iMax = Max ( iMax, iKidMax ); } return iMax; } void ExtQuorum_c::SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { ARRAY_FOREACH ( i, m_dChildren ) m_dChildren[i].m_pTerm->SetQwordsIDF ( hQwords ); } void ExtQuorum_c::GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ) { ARRAY_FOREACH ( i, m_dChildren ) m_dChildren[i].m_pTerm->GetTermDupes ( hQwords, dTermDupes ); } uint64_t ExtQuorum_c::GetWordID() const { uint64_t uHash = 0; ARRAY_FOREACH ( i, m_dChildren ) uHash ^= m_dChildren[i].m_pTerm->GetWordID(); return uHash; } const ExtDoc_t * ExtQuorum_c::GetDocsChunk ( SphDocID_t * pMaxID ) { // warmup int iQuorumLeft = CountQuorum ( true ); ARRAY_FOREACH ( i, m_dChildren ) { TermTuple_t & tElem = m_dChildren[i]; tElem.m_bStandStill = false; // clear this-round-match flag if ( tElem.m_pCurDoc && tElem.m_pCurDoc->m_uDocid!=DOCID_MAX ) continue; tElem.m_pCurDoc = tElem.m_pTerm->GetDocsChunk ( pMaxID ); if ( tElem.m_pCurDoc ) continue; m_dChildren.RemoveFast ( i ); i--; iQuorumLeft = CountQuorum ( true ); if ( iQuorumLeft=m_iThresh && !bProceed2HitChunk ) { // find min document ID, count occurrences ExtDoc_t tCand; tCand.m_uDocid = DOCID_MAX; // current candidate id tCand.m_uHitlistOffset = 0; // suppress gcc warnings tCand.m_pDocinfo = NULL; tCand.m_uDocFields = 0; // non necessary tCand.m_fTFIDF = 0.0f; int iQuorum = 0; ARRAY_FOREACH ( i, m_dChildren ) { TermTuple_t & tElem = m_dChildren[i]; assert ( tElem.m_pCurDoc->m_uDocid && tElem.m_pCurDoc->m_uDocid!=DOCID_MAX ); if ( tElem.m_pCurDoc->m_uDocid < tCand.m_uDocid ) { tCand = *tElem.m_pCurDoc; iQuorum = tElem.m_iCount; } else if ( tElem.m_pCurDoc->m_uDocid==tCand.m_uDocid ) { tCand.m_uDocFields |= tElem.m_pCurDoc->m_uDocFields; // FIXME!!! check hits in case of dupes or field constrain tCand.m_fTFIDF += tElem.m_pCurDoc->m_fTFIDF; iQuorum += tElem.m_iCount; } } // FIXME!!! check that tail hits should be processed right after CollectMatchingHits if ( iQuorum>=m_iThresh && ( !m_bHasDupes || CollectMatchingHits ( tCand.m_uDocid, m_iThresh ) ) ) { CopyExtDoc ( m_dDocs[iDoc++], tCand, &pDocinfo, m_iStride ); // FIXME!!! move to children advancing ARRAY_FOREACH ( i, m_dChildren ) { if ( m_dChildren[i].m_pCurDoc->m_uDocid==tCand.m_uDocid ) m_dChildren[i].m_bStandStill = true; } } // advance children int iWasChildren = m_dChildren.GetLength(); ARRAY_FOREACH ( i, m_dChildren ) { TermTuple_t & tElem = m_dChildren[i]; if ( tElem.m_pCurDoc->m_uDocid!=tCand.m_uDocid ) continue; tElem.m_pCurDoc++; if ( tElem.m_pCurDoc->m_uDocid!=DOCID_MAX ) continue; // should grab hits first if ( tElem.m_bStandStill ) { bProceed2HitChunk = true; continue; // still should fast forward rest of children to pass current doc-id } tElem.m_pCurDoc = tElem.m_pTerm->GetDocsChunk ( NULL ); if ( tElem.m_pCurDoc ) continue; m_dChildren.RemoveFast ( i ); i--; } if ( iWasChildren!=m_dChildren.GetLength() ) iQuorumLeft = CountQuorum ( false ); } return ReturnDocsChunk ( iDoc, pMaxID ); } const ExtHit_t * ExtQuorum_c::GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ) { // dupe tail hits if ( m_bHasDupes && m_uMatchedDocid ) return GetHitsChunkDupesTail(); // quorum-buffer path if ( m_bHasDupes ) return GetHitsChunkDupes ( pDocs, uMaxID ); return GetHitsChunkSimple ( pDocs, uMaxID ); } const ExtHit_t * ExtQuorum_c::GetHitsChunkDupesTail () { ExtDoc_t dTailDocs[2]; dTailDocs[0].m_uDocid = m_uMatchedDocid; dTailDocs[1].m_uDocid = DOCID_MAX; int iHit = 0; while ( iHitm_uDocid==m_uMatchedDocid && HITMAN::GetLCS ( pCurHit->m_uHitpos ) < uMinPos ) { uMinPos = HITMAN::GetLCS ( pCurHit->m_uHitpos ); iMinChild = i; } } // no hits found at children if ( iMinChild<0 ) { m_uMatchedDocid = 0; break; } m_dHits[iHit++] = *m_dChildren[iMinChild].m_pCurHit; m_dChildren[iMinChild].m_pCurHit++; if ( m_dChildren[iMinChild].m_pCurHit->m_uDocid==DOCID_MAX ) m_dChildren[iMinChild].m_pCurHit = m_dChildren[iMinChild].m_pTerm->GetHitsChunk ( dTailDocs, m_uMatchedDocid ); } return ReturnHitsChunk ( iHit ); } struct QuorumCmpHitPos_fn { inline bool IsLess ( const ExtHit_t & a, const ExtHit_t & b ) const { return HITMAN::GetLCS ( a.m_uHitpos )m_uDocid!=DOCID_MAX ) { SphDocID_t uDocid = pDocs->m_uDocid; while ( m_dQuorumHits[m_iMyLast].m_uDocidm_uDocid==uDocid && HITMAN::GetLCS ( pCurHit->m_uHitpos ) < uMinPos ) { uMinPos = HITMAN::GetLCS ( pCurHit->m_uHitpos ); iMinChild = i; } } } // no hits found at children and quorum-buffer over if ( iMinChild<0 && m_iMyLast==iMyEnd ) { pDocs++; bCheckChildren = false; break; } if ( iMinChild<0 ) { m_dHits[iHit++] = m_dQuorumHits[m_iMyLast++]; } else { m_dHits[iHit++] = *m_dChildren[iMinChild].m_pCurHit; m_dChildren[iMinChild].m_pCurHit++; if ( m_dChildren[iMinChild].m_pCurHit->m_uDocid==DOCID_MAX ) m_dChildren[iMinChild].m_pCurHit = m_dChildren[iMinChild].m_pTerm->GetHitsChunk ( pDocs, uMaxID ); } } if ( m_iMyLast==iMyEnd && bCheckChildren ) // quorum-buffer over but children still have hits m_uMatchedDocid = uDocid; } return ReturnHitsChunk ( iHit ); } const ExtHit_t * ExtQuorum_c::GetHitsChunkSimple ( const ExtDoc_t * pDocs, SphDocID_t uMaxID ) { // warmup bool bHasHits = false; ARRAY_FOREACH ( i, m_dChildren ) { TermTuple_t & tElem = m_dChildren[i]; if ( !tElem.m_pCurHit || tElem.m_pCurHit->m_uDocid==DOCID_MAX ) tElem.m_pCurHit = tElem.m_pTerm->GetHitsChunk ( pDocs, uMaxID ); bHasHits |= ( tElem.m_pCurHit!=NULL ); } if ( !bHasHits ) return ReturnHitsChunk ( 0 ); // main loop int iHit = 0; while ( iHitm_uDocid==m_uMatchedDocid && HITMAN::GetLCS ( pCurHit->m_uHitpos ) < uMinPos ) { uMinPos = HITMAN::GetLCS ( pCurHit->m_uHitpos ); // !COMMIT bench/fix, is LCS right here? iMinChild = i; } } if ( iMinChild<0 ) { SphDocID_t uLastDoc = m_uMatchedDocid; m_uMatchedDocid = 0; while ( pDocs->m_uDocid!=DOCID_MAX && pDocs->m_uDocid<=uLastDoc ) pDocs++; } } // get min common incoming docs and hits doc-id if ( !m_uMatchedDocid ) { bool bDocMatched = false; while ( !bDocMatched && pDocs->m_uDocid!=DOCID_MAX ) { iMinChild = -1; uMinPos = UINT_MAX; ARRAY_FOREACH ( i, m_dChildren ) { TermTuple_t & tElem = m_dChildren[i]; if ( !tElem.m_pCurHit ) continue; // fast forward hits while ( tElem.m_pCurHit->m_uDocidm_uDocid && tElem.m_pCurHit->m_uDocid!=DOCID_MAX ) tElem.m_pCurHit++; if ( tElem.m_pCurHit->m_uDocid==pDocs->m_uDocid ) { bDocMatched = true; if ( HITMAN::GetLCS ( tElem.m_pCurHit->m_uHitpos ) < uMinPos ) { uMinPos = HITMAN::GetLCS ( tElem.m_pCurHit->m_uHitpos ); iMinChild = i; } } // rescan current child if ( tElem.m_pCurHit->m_uDocid==DOCID_MAX ) { tElem.m_pCurHit = tElem.m_pTerm->GetHitsChunk ( pDocs, uMaxID ); i -= ( tElem.m_pCurHit ? 1 : 0 ); } } if ( !bDocMatched ) pDocs++; } assert ( !bDocMatched || pDocs->m_uDocid!=DOCID_MAX ); if ( bDocMatched ) m_uMatchedDocid = pDocs->m_uDocid; else break; } assert ( iMinChild>=0 ); m_dHits[iHit++] = *m_dChildren[iMinChild].m_pCurHit; m_dChildren[iMinChild].m_pCurHit++; if ( m_dChildren[iMinChild].m_pCurHit->m_uDocid==DOCID_MAX ) m_dChildren[iMinChild].m_pCurHit = m_dChildren[iMinChild].m_pTerm->GetHitsChunk ( pDocs, uMaxID ); } return ReturnHitsChunk ( iHit ); } int ExtQuorum_c::GetThreshold ( const XQNode_t & tNode, int iQwords ) { return ( tNode.m_bPercentOp ? (int)floor ( 1.0f / 100.0f * tNode.m_iOpArg * iQwords + 0.5f ) : tNode.m_iOpArg ); } bool ExtQuorum_c::CollectMatchingHits ( SphDocID_t uDocid, int iThreshold ) { assert ( m_dQuorumHits+m_iMyHitCount+CountQuorum ( false )m_uDocid==DOCID_MAX ) tElem.m_pCurHit = tElem.m_pTerm->GetHitsChunk ( tElem.m_pCurDoc, uDocid ); // that hit stream over for now if ( !tElem.m_pCurHit || tElem.m_pCurHit->m_uDocid==DOCID_MAX ) { iTermHits = 0; continue; } while ( tElem.m_pCurHit->m_uDocid!=DOCID_MAX && tElem.m_pCurHit->m_uDocidm_uDocid==uDocid && iTermHits=iThreshold ) break; // there might be tail hits - rescan current child if ( tElem.m_pCurHit->m_uDocid==DOCID_MAX ) { i--; } else { iTermHits = 0; } } // discard collected hits is case of no quorum matched if ( iQuorumm_uDocid==DOCID_MAX ) tElem.m_pCurHit = tElem.m_pTerm->GetHitsChunk ( tElem.m_pCurDoc, uDocid ); // hit stream over for current term if ( !tElem.m_pCurHit || tElem.m_pCurHit->m_uDocid==DOCID_MAX ) continue; // collect all hits while ( tElem.m_pCurHit->m_uDocid==uDocid && pHitBuf=m_dQuorumHits+MAX_HITS ) break; // there might be tail hits - rescan current child if ( tElem.m_pCurHit->m_uDocid==DOCID_MAX ) i--; } m_iMyHitCount = pHitBuf - m_dQuorumHits; return true; } ////////////////////////////////////////////////////////////////////////// ExtOrder_c::ExtOrder_c ( const CSphVector & dChildren, const ISphQwordSetup & tSetup ) : m_dChildren ( dChildren ) , m_bDone ( false ) , m_uHitsOverFor ( 0 ) { int iChildren = dChildren.GetLength(); assert ( iChildren>=2 ); m_pDocs.Resize ( iChildren ); m_pHits.Resize ( iChildren ); m_pDocsChunk.Resize ( iChildren ); m_dMaxID.Resize ( iChildren ); m_dMyHits[0].m_uDocid = DOCID_MAX; if ( dChildren.GetLength()>0 ) m_iAtomPos = dChildren[0]->m_iAtomPos; ARRAY_FOREACH ( i, dChildren ) { assert ( m_dChildren[i] ); m_pDocs[i] = NULL; m_pHits[i] = NULL; } AllocDocinfo ( tSetup ); } void ExtOrder_c::Reset ( const ISphQwordSetup & tSetup ) { m_bDone = false; m_uHitsOverFor = 0; m_dMyHits[0].m_uDocid = DOCID_MAX; ARRAY_FOREACH ( i, m_dChildren ) { assert ( m_dChildren[i] ); m_dChildren[i]->Reset ( tSetup ); m_pDocs[i] = NULL; m_pHits[i] = NULL; } } ExtOrder_c::~ExtOrder_c () { ARRAY_FOREACH ( i, m_dChildren ) SafeDelete ( m_dChildren[i] ); } int ExtOrder_c::GetNextHit ( SphDocID_t uDocid ) { // OPTIMIZE! implement PQ instead of full-scan DWORD uMinPos = UINT_MAX; int iChild = -1; ARRAY_FOREACH ( i, m_dChildren ) { // is this child over? if ( !m_pHits[i] ) continue; // skip until proper hit while ( m_pHits[i]->m_uDocid!=DOCID_MAX && m_pHits[i]->m_uDocidm_uDocid==DOCID_MAX ) { // hits and docs over here if ( !m_pDocsChunk[i] ) return -1; m_pHits[i] = m_dChildren[i]->GetHitsChunk ( m_pDocsChunk[i], m_dMaxID[i] ); i--; continue; } // is this our man at all? if ( m_pHits[i]->m_uDocid==uDocid ) { // is he the best we can get? if ( HITMAN::GetLCS ( m_pHits[i]->m_uHitpos ) < uMinPos ) { uMinPos = HITMAN::GetLCS ( m_pHits[i]->m_uHitpos ); iChild = i; } } } return iChild; } int ExtOrder_c::GetMatchingHits ( SphDocID_t uDocid, ExtHit_t * pHitbuf, int iLimit ) { // my trackers CSphVector dAccLongest; CSphVector dAccRecent; int iPosLongest = 0; // needed to handle cases such as "a b c" << a int iPosRecent = 0; int iField = -1; dAccLongest.Reserve ( m_dChildren.GetLength() ); dAccRecent.Reserve ( m_dChildren.GetLength() ); // while there's enough space in the buffer int iMyHit = 0; while ( iMyHit+m_dChildren.GetLength()m_uDocid==uDocid ); // most recent subseq must never be longer assert ( dAccRecent.GetLength()<=dAccLongest.GetLength() ); // handle that hit! int iHitField = HITMAN::GetField ( pHit->m_uHitpos ); int iHitPos = HITMAN::GetPos ( pHit->m_uHitpos ); if ( iHitField!=iField ) { // new field; reset both trackers dAccLongest.Resize ( 0 ); dAccRecent.Resize ( 0 ); // initial seeding, if needed if ( iChild==0 ) { dAccLongest.Add ( *pHit ); iPosLongest = iHitPos + pHit->m_uSpanlen; iField = iHitField; } } else if ( iChild==dAccLongest.GetLength() && iHitPos>=iPosLongest ) { // it fits longest tracker dAccLongest.Add ( *pHit ); iPosLongest = iHitPos + pHit->m_uSpanlen; // fully matched subsequence if ( dAccLongest.GetLength()==m_dChildren.GetLength() ) { // flush longest tracker into buffer, and keep it terminated ARRAY_FOREACH ( i, dAccLongest ) pHitbuf[iMyHit++] = dAccLongest[i]; // reset both trackers dAccLongest.Resize ( 0 ); dAccRecent.Resize ( 0 ); iPosRecent = iPosLongest; } } else if ( iChild==0 ) { // it restarts most-recent tracker dAccRecent.Resize ( 0 ); dAccRecent.Add ( *pHit ); iPosRecent = iHitPos + pHit->m_uSpanlen; if ( !dAccLongest.GetLength() ) { dAccLongest.Add ( *pHit ); iPosLongest = iHitPos + pHit->m_uSpanlen; } } else if ( iChild==dAccRecent.GetLength() && iHitPos>=iPosRecent ) { // it fits most-recent tracker dAccRecent.Add ( *pHit ); iPosRecent = iHitPos + pHit->m_uSpanlen; // maybe most-recent just became longest too? if ( dAccRecent.GetLength()==dAccLongest.GetLength() ) { dAccLongest.SwapData ( dAccRecent ); dAccRecent.Resize ( 0 ); iPosLongest = iPosRecent; } } // advance hit stream m_pHits[iChild]++; } assert ( iMyHit>=0 && iMyHitGetDocsChunk ( &m_dMaxID[i] ); if ( !m_pDocs[i] ) { m_bDone = true; return NULL; } } // match, while there's enough space in buffers CSphRowitem * pDocinfo = m_pDocinfo; int iDoc = 0; int iMyHit = 0; while ( iDocm_uDocid; assert ( uDocid!=DOCID_MAX ); for ( int i=1; im_uDocid < uDocid ) m_pDocs[i]++; // block end marker? pull next block and keep scanning if ( m_pDocs[i]->m_uDocid==DOCID_MAX ) { m_pDocs[i] = m_pDocsChunk[i] = m_dChildren[i]->GetDocsChunk ( &m_dMaxID[i] ); if ( !m_pDocs[i] ) { m_bDone = true; return ReturnDocsChunk ( iDoc, pMaxID ); } continue; } // too big id? its out next candidate if ( m_pDocs[i]->m_uDocid > uDocid ) { uDocid = m_pDocs[i]->m_uDocid; i = 0; continue; } assert ( m_pDocs[i]->m_uDocid==uDocid ); i++; } #ifndef NDEBUG assert ( uDocid!=DOCID_MAX ); ARRAY_FOREACH ( i, m_dChildren ) { assert ( m_pDocs[i] ); assert ( m_pDocs[i]->m_uDocid==uDocid ); } #endif // prefetch hits ARRAY_FOREACH ( i, m_dChildren ) { if ( !m_pHits[i] ) m_pHits[i] = m_dChildren[i]->GetHitsChunk ( m_pDocsChunk[i], m_dMaxID[i] ); // every document comes with at least one hit // and we did not yet process current candidate's hits // so we MUST have hits at this point no matter what assert ( m_pHits[i] ); } // match and save hits int iGotHits = GetMatchingHits ( uDocid, m_dMyHits+iMyHit, MAX_HITS-1-iMyHit ); if ( iGotHits ) { CopyExtDoc ( m_dDocs[iDoc++], *m_pDocs[0], &pDocinfo, m_iStride ); iMyHit += iGotHits; } // advance doc stream m_pDocs[0]++; if ( m_pDocs[0]->m_uDocid==DOCID_MAX ) { m_pDocs[0] = m_pDocsChunk[0] = m_dChildren[0]->GetDocsChunk ( &m_dMaxID[0] ); if ( !m_pDocs[0] ) { m_bDone = true; break; } } } return ReturnDocsChunk ( iDoc, pMaxID ); } const ExtHit_t * ExtOrder_c::GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t ) { if ( pDocs->m_uDocid==m_uHitsOverFor ) return NULL; // copy accumulated hits while we can SphDocID_t uFirstMatch = pDocs->m_uDocid; const ExtHit_t * pMyHits = m_dMyHits; int iHit = 0; for ( ;; ) { while ( pDocs->m_uDocid!=pMyHits->m_uDocid ) { while ( pDocs->m_uDocid < pMyHits->m_uDocid ) pDocs++; if ( pDocs->m_uDocid==DOCID_MAX ) break; while ( pMyHits->m_uDocid < pDocs->m_uDocid ) pMyHits++; if ( pMyHits->m_uDocid==DOCID_MAX ) break; } if ( pDocs->m_uDocid==DOCID_MAX || pMyHits->m_uDocid==DOCID_MAX ) break; assert ( pDocs->m_uDocid==pMyHits->m_uDocid ); while ( pDocs->m_uDocid==pMyHits->m_uDocid ) m_dHits[iHit++] = *pMyHits++; assert ( iHitm_uDocid==DOCID_MAX ) { // ...all of them! setup the next run to check for trailing hits m_dMyHits[0].m_uDocid = DOCID_MAX; } else { // ...but not all of them! we ran out of docs earlier; hence, trailing hits are of no interest m_uHitsOverFor = uFirstMatch; } } else { // we did not copy any hits; check for trailing ones as the last resort if ( pDocs->m_uDocid!=DOCID_MAX ) { iHit = GetMatchingHits ( pDocs->m_uDocid, m_dHits, MAX_HITS-1 ); } if ( !iHit ) { // actually, not *only* in this case, also in partial buffer case // but for simplicity, lets just run one extra GetHitsChunk() iteration m_uHitsOverFor = uFirstMatch; } } // all done assert ( iHitGetQwords ( hQwords ); iMax = Max ( iMax, iKidMax ); } return iMax; } void ExtOrder_c::SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { ARRAY_FOREACH ( i, m_dChildren ) m_dChildren[i]->SetQwordsIDF ( hQwords ); } void ExtOrder_c::GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ) { ARRAY_FOREACH ( i, m_dChildren ) m_dChildren[i]->GetTermDupes ( hQwords, dTermDupes ); } uint64_t ExtOrder_c::GetWordID () const { uint64_t uHash = 0; ARRAY_FOREACH ( i, m_dChildren ) uHash ^= m_dChildren[i]->GetWordID(); return uHash; } ////////////////////////////////////////////////////////////////////////// ExtUnit_c::ExtUnit_c ( ExtNode_i * pFirst, ExtNode_i * pSecond, const CSphSmallBitvec& uFields, const ISphQwordSetup & tSetup, const char * sUnit ) { m_pArg1 = pFirst; m_pArg2 = pSecond; XQKeyword_t tDot; tDot.m_sWord = sUnit; m_pDot = new ExtTerm_c ( CreateQueryWord ( tDot, tSetup ), uFields, tSetup, true ); m_uHitsOverFor = 0; m_uTailDocid = 0; m_uTailSentenceEnd = 0; m_pDocs1 = NULL; m_pDocs2 = NULL; m_pDotDocs = NULL; m_pDoc1 = NULL; m_pDoc2 = NULL; m_pDotDoc = NULL; m_pHit1 = NULL; m_pHit2 = NULL; m_pDotHit = NULL; m_dMyHits[0].m_uDocid = DOCID_MAX; } ExtUnit_c::~ExtUnit_c () { SafeDelete ( m_pArg1 ); SafeDelete ( m_pArg2 ); SafeDelete ( m_pDot ); } void ExtUnit_c::Reset ( const ISphQwordSetup & tSetup ) { m_pArg1->Reset ( tSetup ); m_pArg2->Reset ( tSetup ); m_pDot->Reset ( tSetup ); m_uHitsOverFor = 0; m_uTailDocid = 0; m_uTailSentenceEnd = 0; m_pDocs1 = NULL; m_pDocs2 = NULL; m_pDotDocs = NULL; m_pDoc1 = NULL; m_pDoc2 = NULL; m_pDotDoc = NULL; m_pHit1 = NULL; m_pHit2 = NULL; m_pDotHit = NULL; m_dMyHits[0].m_uDocid = DOCID_MAX; } int ExtUnit_c::GetQwords ( ExtQwordsHash_t & hQwords ) { int iMax1 = m_pArg1->GetQwords ( hQwords ); int iMax2 = m_pArg2->GetQwords ( hQwords ); return Max ( iMax1, iMax2 ); } void ExtUnit_c::SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { m_pArg1->SetQwordsIDF ( hQwords ); m_pArg2->SetQwordsIDF ( hQwords ); } void ExtUnit_c::GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ) { m_pArg1->GetTermDupes ( hQwords, dTermDupes ); m_pArg2->GetTermDupes ( hQwords, dTermDupes ); } uint64_t ExtUnit_c::GetWordID() const { uint64_t uHash = m_pArg1->GetWordID(); uHash ^= m_pArg2->GetWordID(); return uHash; } /// skips hits until their docids are less than the given limit static inline void SkipHitsLtDocid ( const ExtHit_t * (*ppHits), SphDocID_t uMatch, ExtNode_i * pNode, const ExtDoc_t * pDocs ) { for ( ;; ) { const ExtHit_t * pHit = *ppHits; if ( !pHit || pHit->m_uDocid==DOCID_MAX ) { pHit = *ppHits = pNode->GetHitsChunk ( pDocs, DOCID_MAX ); // OPTIMIZE? use that max? if ( !pHit ) return; } while ( pHit->m_uDocid < uMatch ) pHit++; *ppHits = pHit; if ( pHit->m_uDocid!=DOCID_MAX ) return; } } /// skips hits within current document while their position is less or equal than the given limit /// returns true if a matching hit (with big enough position, and in current document) was found /// returns false otherwise static inline bool SkipHitsLtePos ( const ExtHit_t * (*ppHits), Hitpos_t uPos, ExtNode_i * pNode, const ExtDoc_t * pDocs ) { SphDocID_t uDocid = (*ppHits)->m_uDocid; for ( ;; ) { const ExtHit_t * pHit = *ppHits; if ( !pHit || pHit->m_uDocid==DOCID_MAX ) { pHit = *ppHits = pNode->GetHitsChunk ( pDocs, DOCID_MAX ); // OPTIMIZE? use that max? if ( !pHit ) return false; } while ( pHit->m_uDocid==uDocid && pHit->m_uHitpos<=uPos ) pHit++; *ppHits = pHit; if ( pHit->m_uDocid!=DOCID_MAX ) return ( pHit->m_uDocid==uDocid ); } } int ExtUnit_c::FilterHits ( int iMyHit, DWORD uSentenceEnd, SphDocID_t uDocid, int * pDoc ) { while ( iMyHitm_uDocid==uDocid && m_pHit1->m_uHitposm_uDocid==uDocid && m_pHit2->m_uHitposm_uDocid==uDocid && m_pHit2->m_uDocid==uDocid ) continue; // no more in-sentence hits, but perhaps more sentences in this document else break; // document is over } // register document as matching if ( pDoc ) { ExtDoc_t & tDoc = m_dDocs [ (*pDoc)++ ]; tDoc.m_uDocid = m_pDoc1->m_uDocid; tDoc.m_uDocFields = m_pDoc1->m_uDocFields | m_pDoc2->m_uDocFields; // non necessary tDoc.m_uHitlistOffset = -1; tDoc.m_fTFIDF = m_pDoc1->m_fTFIDF + m_pDoc2->m_fTFIDF; tDoc.m_pDocinfo = NULL; // no inline support, sorry pDoc = NULL; // just once } if ( bValid1 && ( !bValid2 || m_pHit1->m_uHitpos < m_pHit2->m_uHitpos ) ) { m_dMyHits[iMyHit++] = *m_pHit1++; if ( m_pHit1->m_uDocid==DOCID_MAX ) m_pHit1 = m_pArg1->GetHitsChunk ( m_pDocs1, DOCID_MAX ); } else { m_dMyHits[iMyHit++] = *m_pHit2++; if ( m_pHit2->m_uDocid==DOCID_MAX ) m_pHit2 = m_pArg2->GetHitsChunk ( m_pDocs2, DOCID_MAX ); } } else { // no sentence matched yet // let's check the next hit pair assert ( m_pHit1->m_uDocid==uDocid ); assert ( m_pHit2->m_uDocid==uDocid ); assert ( m_pDotHit->m_uDocid==uDocid ); // our current hit pair locations DWORD uMin = Min ( m_pHit1->m_uHitpos, m_pHit2->m_uHitpos ); DWORD uMax = Max ( m_pHit1->m_uHitpos, m_pHit2->m_uHitpos ); // skip all dots beyond the min location if ( !SkipHitsLtePos ( &m_pDotHit, uMin, m_pDot, m_pDotDocs ) ) { // we have a match! // moreover, no more dots past min location in current document // copy hits until next document uSentenceEnd = UINT_MAX; continue; } // does the first post-pair-start dot separate our hit pair? if ( m_pDotHit->m_uHitpos < uMax ) { // yes, got an "A dot B" case // rewind candidate hits past this dot, break if current document is over if ( !SkipHitsLtePos ( &m_pHit1, m_pDotHit->m_uHitpos, m_pArg1, m_pDocs1 ) ) break; if ( !SkipHitsLtePos ( &m_pHit2, m_pDotHit->m_uHitpos, m_pArg2, m_pDocs2 ) ) break; continue; } else { // we have a match! // copy hits until next dot if ( !SkipHitsLtePos ( &m_pDotHit, uMax, m_pDot, m_pDotDocs ) ) uSentenceEnd = UINT_MAX; // correction, no next dot, so make it "next document" else uSentenceEnd = m_pDotHit->m_uHitpos; assert ( uSentenceEnd ); } } } m_uTailSentenceEnd = uSentenceEnd; // just in case tail hits loop will happen return iMyHit; } void ExtUnit_c::SkipTailHits () { m_uTailDocid = 0; m_pDoc1++; m_pDoc2++; } const ExtDoc_t * ExtUnit_c::GetDocsChunk ( SphDocID_t * pMaxID ) { // SENTENCE operator is essentially AND on steroids // that also takes relative dot positions into account // // when document matches both args but not the dot, it degenerates into AND // we immediately lookup and copy matching document hits anyway, though // this is suboptimal (because these hits might never be required at all) // but this is expected to be rare case, so let's keep code simple // // when document matches both args and the dot, we need to filter the hits // only those left/right pairs that are not (!) separated by a dot should match int iDoc = 0; int iMyHit = 0; if ( m_uTailDocid ) SkipTailHits(); while ( iMyHitm_uDocid==DOCID_MAX ) { m_pDoc1 = m_pDocs1 = m_pArg1->GetDocsChunk ( NULL ); if ( !m_pDoc1 ) break; // node is over } if ( !m_pDoc2 || m_pDoc2->m_uDocid==DOCID_MAX ) { m_pDoc2 = m_pDocs2 = m_pArg2->GetDocsChunk ( NULL ); if ( !m_pDoc2 ) break; // node is over } // find next candidate match while ( m_pDoc1->m_uDocid!=m_pDoc2->m_uDocid && m_pDoc1->m_uDocid!=DOCID_MAX && m_pDoc2->m_uDocid!=DOCID_MAX ) { while ( m_pDoc1->m_uDocid < m_pDoc2->m_uDocid && m_pDoc2->m_uDocid!=DOCID_MAX ) m_pDoc1++; while ( m_pDoc1->m_uDocid > m_pDoc2->m_uDocid && m_pDoc1->m_uDocid!=DOCID_MAX ) m_pDoc2++; } // got our candidate that matches AND? SphDocID_t uDocid = m_pDoc1->m_uDocid; if ( m_pDoc1->m_uDocid==DOCID_MAX || m_pDoc2->m_uDocid==DOCID_MAX ) continue; // yes, now fetch more dots docs, if needed // note how NULL is accepted here, "A and B but no dots" case is valid! if ( !m_pDotDoc || m_pDotDoc->m_uDocid==DOCID_MAX ) m_pDotDoc = m_pDotDocs = m_pDot->GetDocsChunk ( NULL ); // skip preceding docs while ( m_pDotDoc && m_pDotDoc->m_uDocid < uDocid ) { while ( m_pDotDoc->m_uDocid < uDocid ) m_pDotDoc++; if ( m_pDotDoc->m_uDocid==DOCID_MAX ) m_pDotDoc = m_pDotDocs = m_pDot->GetDocsChunk ( NULL ); } // we will need document hits on both routes below SkipHitsLtDocid ( &m_pHit1, uDocid, m_pArg1, m_pDocs1 ); SkipHitsLtDocid ( &m_pHit2, uDocid, m_pArg2, m_pDocs2 ); assert ( m_pHit1->m_uDocid==uDocid ); assert ( m_pHit2->m_uDocid==uDocid ); DWORD uSentenceEnd = 0; if ( !m_pDotDoc || m_pDotDoc->m_uDocid!=uDocid ) { // no dots in current document? // just copy all hits until next document uSentenceEnd = UINT_MAX; } else { // got both hits and dots // rewind to relevant dots hits, then do sentence boundary detection SkipHitsLtDocid ( &m_pDotHit, uDocid, m_pDot, m_pDotDocs ); } // do those hits iMyHit = FilterHits ( iMyHit, uSentenceEnd, uDocid, &iDoc ); // out of matching hits buffer? gotta return docs chunk now, then if ( iMyHit==MAX_HITS-1 ) { // mark a possibility of some trailing hits for current dot, if any if ( ( m_pHit1 && m_pHit1->m_uDocid==uDocid ) || ( m_pHit2 && m_pHit2->m_uDocid==uDocid ) ) { m_uTailDocid = uDocid; // yep, do check that tail } else { SkipTailHits(); // nope, both hit lists are definitely over } return ReturnDocsChunk ( iDoc, iMyHit, pMaxID ); } // all hits copied; do the next candidate m_pDoc1++; m_pDoc2++; } return ReturnDocsChunk ( iDoc, iMyHit, pMaxID ); } const ExtHit_t * ExtUnit_c::GetHitsChunk ( const ExtDoc_t * pDocs, SphDocID_t ) { SphDocID_t uFirstMatch = pDocs->m_uDocid; // current hits chunk already returned if ( m_uHitsOverFor==uFirstMatch ) { // and there are no trailing hits? bail if ( !m_uTailDocid ) return NULL; // and there might be trailing hits for the last document? try and loop them int iMyHit = FilterHits ( 0, m_uTailSentenceEnd, m_uTailDocid, NULL ); if ( !iMyHit ) { // no trailing hits were there actually m_uTailDocid = 0; m_pDoc1++; m_pDoc2++; return NULL; } // ok, we got some trailing hits! // check whether we might have even more if (!( iMyHit==MAX_HITS-1 && m_pHit1 && m_pHit1->m_uDocid==m_uTailDocid && m_pHit2 && m_pHit2->m_uDocid==m_uTailDocid )) { // nope, both hit lists are definitely over now m_uTailDocid = 0; m_pDoc1++; m_pDoc2++; } // return those trailing hits assert ( iMyHitm_uDocid!=pDocs->m_uDocid ) { while ( pDocs->m_uDocid < pMyHit->m_uDocid && pDocs->m_uDocid!=DOCID_MAX ) pDocs++; if ( pDocs->m_uDocid==DOCID_MAX ) break; while ( pMyHit->m_uDocid < pDocs->m_uDocid ) pMyHit++; } // out of hits if ( pMyHit->m_uDocid==DOCID_MAX || pDocs->m_uDocid==DOCID_MAX ) { // there still might be trailing hits // if so, they will be handled on next entry m_uHitsOverFor = uFirstMatch; if ( pDocs->m_uDocid==DOCID_MAX && m_uTailDocid ) SkipTailHits(); break; } // copy while ( pMyHit->m_uDocid==pDocs->m_uDocid ) m_dHits[iHit++] = *pMyHit++; if ( pMyHit->m_uDocid==DOCID_MAX ) { m_uHitsOverFor = uFirstMatch; break; } } assert ( iHit>=0 && iHit & dZones, CSphStringBuilder & tRes, int iIdent ) { if ( iIdent ) tRes.Appendf ( "\n" ); for ( int i=0; iGetOp() ) { case SPH_QUERY_AND: tRes.Appendf ( "AND(" ); break; case SPH_QUERY_OR: tRes.Appendf ( "OR(" ); break; case SPH_QUERY_NOT: tRes.Appendf ( "NOT(" ); break; case SPH_QUERY_ANDNOT: tRes.Appendf ( "ANDNOT(" ); break; case SPH_QUERY_BEFORE: tRes.Appendf ( "BEFORE(" ); break; case SPH_QUERY_PHRASE: tRes.Appendf ( "PHRASE(" ); break; case SPH_QUERY_PROXIMITY: tRes.Appendf ( "PROXIMITY(distance=%d, ", pNode->m_iOpArg ); break; case SPH_QUERY_QUORUM: tRes.Appendf ( "QUORUM(count=%d, ", pNode->m_iOpArg ); break; case SPH_QUERY_NEAR: tRes.Appendf ( "NEAR(distance=%d", pNode->m_iOpArg ); break; case SPH_QUERY_SENTENCE: tRes.Appendf ( "SENTENCE(" ); break; case SPH_QUERY_PARAGRAPH: tRes.Appendf ( "PARAGRAPH(" ); break; default: tRes.Appendf ( "OPERATOR-%d(", pNode->GetOp() ); break; } if ( pNode->m_dChildren.GetLength() && pNode->m_dWords.GetLength() ) tRes.Appendf("virtually-plain, "); // dump spec for keyword nodes // FIXME? double check that spec does *not* affect non keyword nodes if ( !pNode->m_dSpec.IsEmpty() && pNode->m_dWords.GetLength() ) { const XQLimitSpec_t & s = pNode->m_dSpec; if ( s.m_bFieldSpec && !s.m_dFieldMask.TestAll ( true ) ) { tRes.Appendf ( "fields=(" ).ResetSeparator(); ARRAY_FOREACH ( i, tSchema.m_dFields ) if ( s.m_dFieldMask.Test(i) ) tRes.AppendSeparator ( ", " ).Appendf ( "%s", tSchema.m_dFields[i].m_sName.cstr() ); tRes.Appendf ( "), " ); } if ( s.m_iFieldMaxPos ) tRes.Appendf ( "max_field_pos=%d, ", s.m_iFieldMaxPos ); if ( s.m_dZones.GetLength() ) { tRes.Appendf ( s.m_bZoneSpan ? "zonespans=(" : "zones=(" ).ResetSeparator(); ARRAY_FOREACH ( i, s.m_dZones ) tRes.AppendSeparator ( ", " ).Appendf ( "%s", dZones [ s.m_dZones[i] ].cstr() ); tRes.Appendf ( "), " ); } } if ( pNode->m_dChildren.GetLength() ) { ARRAY_FOREACH ( i, pNode->m_dChildren ) { if ( i>0 ) tRes.Appendf ( ", " ); Explain ( pNode->m_dChildren[i], tSchema, dZones, tRes, iIdent+1 ); } } else { ARRAY_FOREACH ( i, pNode->m_dWords ) { const XQKeyword_t & w = pNode->m_dWords[i]; if ( i>0 ) tRes.Appendf(", "); tRes.Appendf ( "KEYWORD(%s, querypos=%d", w.m_sWord.cstr(), w.m_iAtomPos ); if ( w.m_bExcluded ) tRes.Appendf ( ", excluded" ); if ( w.m_bExpanded ) tRes.Appendf ( ", expanded" ); if ( w.m_bFieldStart ) tRes.Appendf ( ", field_start" ); if ( w.m_bFieldEnd ) tRes.Appendf ( ", field_end" ); if ( w.m_bMorphed ) tRes.Appendf ( ", morphed" ); tRes.Appendf ( ")" ); } } tRes.Appendf(")"); } ExtRanker_c::ExtRanker_c ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup ) { assert ( tSetup.m_pCtx ); m_iInlineRowitems = tSetup.m_iInlineRowitems; for ( int i=0; iDebugDump(0); #endif // we generally have three (!) trees for each query // 1) parsed tree, a raw result of query parsing // 2) transformed tree, with star expansions, morphology, and other transfomations // 3) evaluation tree, with tiny keywords cache, and other optimizations // tXQ.m_pRoot, passed to ranker from the index, is the transformed tree // m_pRoot, internal to ranker, is the evaluation tree if ( tSetup.m_pCtx->m_pProfile ) { tSetup.m_pCtx->m_pProfile->m_sTransformedTree.Reset(); Explain ( tXQ.m_pRoot, tSetup.m_pIndex->GetMatchSchema(), tXQ.m_dZones, tSetup.m_pCtx->m_pProfile->m_sTransformedTree, 0 ); } m_pDoclist = NULL; m_pHitlist = NULL; m_uMaxID = 0; m_uPayloadMask = 0; m_iQwords = 0; m_pIndex = tSetup.m_pIndex; m_pCtx = tSetup.m_pCtx; m_pNanoBudget = tSetup.m_pStats ? tSetup.m_pStats->m_pNanoBudget : NULL; m_dZones = tXQ.m_dZones; m_dZoneStart.Resize ( m_dZones.GetLength() ); m_dZoneEnd.Resize ( m_dZones.GetLength() ); m_dZoneMax.Resize ( m_dZones.GetLength() ); m_dZoneMin.Resize ( m_dZones.GetLength() ); m_dZoneMax.Fill ( 0 ); m_dZoneMin.Fill ( DOCID_MAX ); m_bZSlist = tXQ.m_bNeedSZlist; CSphDict * pZonesDict = NULL; // workaround for a particular case with following conditions if ( m_pIndex->IsStarEnabled() && !m_pIndex->GetDictionary()->GetSettings().m_bWordDict && m_dZones.GetLength() ) pZonesDict = m_pIndex->GetDictionary()->Clone(); ARRAY_FOREACH ( i, m_dZones ) { XQKeyword_t tDot; tDot.m_sWord.SetSprintf ( "%c%s", MAGIC_CODE_ZONE, m_dZones[i].cstr() ); m_dZoneStartTerm.Add ( new ExtTerm_c ( CreateQueryWord ( tDot, tSetup, pZonesDict ), tSetup ) ); m_dZoneStart[i] = NULL; tDot.m_sWord.SetSprintf ( "%c/%s", MAGIC_CODE_ZONE, m_dZones[i].cstr() ); m_dZoneEndTerm.Add ( new ExtTerm_c ( CreateQueryWord ( tDot, tSetup, pZonesDict ), tSetup ) ); m_dZoneEnd[i] = NULL; } SafeDelete ( pZonesDict ); } ExtRanker_c::~ExtRanker_c () { SafeDelete ( m_pRoot ); ARRAY_FOREACH ( i, m_dZones ) { SafeDelete ( m_dZoneStartTerm[i] ); SafeDelete ( m_dZoneEndTerm[i] ); } } void ExtRanker_c::Reset ( const ISphQwordSetup & tSetup ) { if ( m_pRoot ) m_pRoot->Reset ( tSetup ); ARRAY_FOREACH ( i, m_dZones ) { m_dZoneStartTerm[i]->Reset ( tSetup ); m_dZoneEndTerm[i]->Reset ( tSetup ); m_dZoneStart[i] = NULL; m_dZoneEnd[i] = NULL; } m_dZoneMax.Fill ( 0 ); m_dZoneMin.Fill ( DOCID_MAX ); m_hZoneInfo.Reset(); } const ExtDoc_t * ExtRanker_c::GetFilteredDocs () { ESphQueryState eState = SPH_QSTATE_TOTAL; CSphQueryProfile * pProfile = m_pCtx->m_pProfile; for ( ;; ) { // get another chunk m_uMaxID = 0; if ( pProfile ) eState = pProfile->Switch ( SPH_QSTATE_GET_DOCS ); const ExtDoc_t * pCand = m_pRoot->GetDocsChunk ( &m_uMaxID ); if ( !pCand ) { if ( pProfile ) pProfile->Switch ( eState ); return NULL; } // create matches, and filter them if ( pProfile ) pProfile->Switch ( SPH_QSTATE_FILTER ); int iDocs = 0; while ( pCand->m_uDocid!=DOCID_MAX ) { m_tTestMatch.m_iDocID = pCand->m_uDocid; if ( pCand->m_pDocinfo ) memcpy ( m_tTestMatch.m_pDynamic, pCand->m_pDocinfo, m_iInlineRowitems*sizeof(CSphRowitem) ); if ( m_pIndex->EarlyReject ( m_pCtx, m_tTestMatch ) ) { pCand++; continue; } m_dMyDocs[iDocs] = *pCand; m_tTestMatch.m_iWeight = (int)( (pCand->m_fTFIDF+0.5f)*SPH_BM25_SCALE ); // FIXME! bench bNeedBM25 Swap ( m_tTestMatch, m_dMyMatches[iDocs] ); iDocs++; pCand++; } // clean up zone hash if ( m_uMaxID!=DOCID_MAX ) { ARRAY_FOREACH ( i, m_dZoneMin ) { SphDocID_t uMinDocid = m_dZoneMin[i]; if ( uMinDocid==DOCID_MAX ) continue; Verify ( m_hZoneInfo.IterateStart ( ZoneKey_t ( i, uMinDocid ) ) ); uMinDocid = DOCID_MAX; do { ZoneKey_t tKey = m_hZoneInfo.IterateGetKey(); if ( tKey.m_iZone!=i || tKey.m_uDocid>m_uMaxID ) { uMinDocid = ( tKey.m_iZone==i ) ? tKey.m_uDocid : DOCID_MAX; break; } m_hZoneInfo.Delete ( tKey ); } while ( m_hZoneInfo.IterateNext() ); m_dZoneMin[i] = uMinDocid; } } if ( iDocs ) { if ( m_pNanoBudget ) *m_pNanoBudget -= g_iPredictorCostMatch*iDocs; m_dMyDocs[iDocs].m_uDocid = DOCID_MAX; if ( pProfile ) pProfile->Switch ( eState ); return m_dMyDocs; } } } void ExtRanker_c::SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { m_iQwords = hQwords.GetLength (); if ( m_pRoot ) m_pRoot->SetQwordsIDF ( hQwords ); } SphZoneHit_e ExtRanker_c::IsInZone ( int iZone, const ExtHit_t * pHit, int * pLastSpan ) { // quick route, we have current docid cached ZoneKey_t tKey ( iZone, pHit->m_uDocid ); // OPTIMIZE? allow 2-component hash keys maybe? ZoneInfo_t * pZone = m_hZoneInfo ( tKey ); if ( pZone ) { // remove end markers that might mess up ordering Hitpos_t uPos = HITMAN::GetLCS ( pHit->m_uHitpos ); int iSpan = FindSpan ( pZone->m_dStarts, uPos ); if ( iSpan<0 || uPos>pZone->m_dEnds[iSpan] ) return SPH_ZONE_NO_SPAN; if ( pLastSpan ) *pLastSpan = iSpan; return SPH_ZONE_FOUND; } // is there any zone info for this document at all? if ( pHit->m_uDocid<=m_dZoneMax[iZone] ) return SPH_ZONE_NO_DOCUMENT; // long route, read in zone info for all (!) the documents until next requested // that's because we might be queried out of order // current chunk const ExtDoc_t * pStart = m_dZoneStart[iZone]; const ExtDoc_t * pEnd = m_dZoneEnd[iZone]; // now keep caching spans until we see current id while ( pHit->m_uDocid > m_dZoneMax[iZone] ) { // get more docs if needed if ( ( !pStart && m_dZoneMax[iZone]!=DOCID_MAX ) || pStart->m_uDocid==DOCID_MAX ) { pStart = m_dZoneStartTerm[iZone]->GetDocsChunk ( NULL ); if ( !pStart ) { m_dZoneMax[iZone] = DOCID_MAX; return SPH_ZONE_NO_DOCUMENT; } } if ( ( !pEnd && m_dZoneMax[iZone]!=DOCID_MAX ) || pEnd->m_uDocid==DOCID_MAX ) { pEnd = m_dZoneEndTerm[iZone]->GetDocsChunk ( NULL ); if ( !pEnd ) { m_dZoneMax[iZone] = DOCID_MAX; return SPH_ZONE_NO_DOCUMENT; } } assert ( pStart && pEnd ); // skip zone starts past already cached stuff while ( pStart->m_uDocid<=m_dZoneMax[iZone] ) pStart++; if ( pStart->m_uDocid==DOCID_MAX ) continue; // skip zone ends until a match with start while ( pEnd->m_uDocidm_uDocid ) pEnd++; if ( pEnd->m_uDocid==DOCID_MAX ) continue; // handle mismatching start/end ids // (this must never happen normally, but who knows what data we're fed) assert ( pStart->m_uDocid!=DOCID_MAX ); assert ( pEnd->m_uDocid!=DOCID_MAX ); assert ( pStart->m_uDocid<=pEnd->m_uDocid ); if ( pStart->m_uDocid!=pEnd->m_uDocid ) { while ( pStart->m_uDocid < pEnd->m_uDocid ) pStart++; if ( pStart->m_uDocid==DOCID_MAX ) continue; } // first matching uncached docid found! assert ( pStart->m_uDocid==pEnd->m_uDocid ); assert ( pStart->m_uDocid > m_dZoneMax[iZone] ); // but maybe we don't need docid this big just yet? if ( pStart->m_uDocid > pHit->m_uDocid ) { // store current in-chunk positions m_dZoneStart[iZone] = pStart; m_dZoneEnd[iZone] = pEnd; // no zone info for all those precending documents (including requested one) m_dZoneMax[iZone] = pStart->m_uDocid-1; return SPH_ZONE_NO_DOCUMENT; } // cache all matching docs from current chunks below requested docid // (there might be more matching docs, but we are lazy and won't cache them upfront) ExtDoc_t dCache [ ExtNode_i::MAX_DOCS ]; int iCache = 0; while ( pStart->m_uDocid<=pHit->m_uDocid ) { // match if ( pStart->m_uDocid==pEnd->m_uDocid ) { dCache[iCache++] = *pStart; pStart++; pEnd++; continue; } // mismatch! // this must not really happen, starts/ends must be in sync // but let's be graceful anyway, and just skip to next match if ( pStart->m_uDocid==DOCID_MAX || pEnd->m_uDocid==DOCID_MAX ) break; while ( pStart->m_uDocid < pEnd->m_uDocid ) pStart++; if ( pStart->m_uDocid==DOCID_MAX ) break; while ( pEnd->m_uDocid < pStart->m_uDocid ) pEnd++; if ( pEnd->m_uDocid==DOCID_MAX ) break; } // should have found at least one id to cache assert ( iCache ); assert ( iCache < ExtNode_i::MAX_DOCS ); dCache[iCache].m_uDocid = DOCID_MAX; // do caching const ExtHit_t * pStartHits = m_dZoneStartTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX ); const ExtHit_t * pEndHits = m_dZoneEndTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX ); // loop documents one by one while ( pStartHits && pEndHits ) { // load all hits for current document SphDocID_t uCur = pStartHits->m_uDocid; tKey.m_uDocid = uCur; pZone = &m_hZoneInfo.AddUnique ( tKey ); assert ( pEndHits->m_uDocid==uCur ); // load all the pairs of start and end hits for it // do it by with the FSM: // // state 'begin': // - start marker -> set state 'inspan', startspan=pos++ // - end marker -> pos++ // - end of doc -> set state 'finish' // // state 'inspan': // - start marker -> startspan = pos++ // - end marker -> set state 'outspan', endspan=pos++ // - end of doc -> set state 'finish' // // state 'outspan': // - start marker -> set state 'inspan', commit span, startspan=pos++ // - end marker -> endspan = pos++ // - end of doc -> set state 'finish', commit span // // state 'finish': // - we are done. int bEofDoc = 0; // state 'begin' is here. while ( !bEofDoc && pEndHits->m_uHitpos < pStartHits->m_uHitpos ) { ++pEndHits; bEofDoc |= (pEndHits->m_uDocid!=uCur)?2:0; } if ( !bEofDoc ) { // state 'inspan' (true) or 'outspan' (false) bool bInSpan = true; Hitpos_t iSpanBegin = pStartHits->m_uHitpos; Hitpos_t iSpanEnd = pEndHits->m_uHitpos; while ( bEofDoc!=3 ) /// action end-of-doc { // action inspan/start-marker if ( bInSpan ) { ++pStartHits; bEofDoc |= (pStartHits->m_uDocid!=uCur)?1:0; } else // action outspan/end-marker { ++pEndHits; bEofDoc |= (pEndHits->m_uDocid!=uCur)?2:0; } if ( pStartHits->m_uHitposm_uHitpos && !( bEofDoc & 1 ) ) { // actions for outspan/start-marker state // ....... will ignore all the inside. if ( !bInSpan ) { bInSpan = true; pZone->m_dStarts.Add ( iSpanBegin ); pZone->m_dEnds.Add ( iSpanEnd ); iSpanBegin = pStartHits->m_uHitpos; } } else if ( !( bEofDoc & 2 ) ) { // actions for inspan/end-marker state // so, ....... will ignore all the inside. bInSpan = false; iSpanEnd = pEndHits->m_uHitpos; } } // action 'commit' for outspan/end-of-doc if ( iSpanBegin < iSpanEnd ) { pZone->m_dStarts.Add ( iSpanBegin ); pZone->m_dEnds.Add ( iSpanEnd ); } if ( pStartHits->m_uDocid==DOCID_MAX ) pStartHits = m_dZoneStartTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX ); if ( pEndHits->m_uDocid==DOCID_MAX ) pEndHits = m_dZoneEndTerm[iZone]->GetHitsChunk ( dCache, DOCID_MAX ); } // data sanity checks assert ( pZone->m_dStarts.GetLength()==pZone->m_dEnds.GetLength() ); // update cache status m_dZoneMax[iZone] = uCur; m_dZoneMin[iZone] = Min ( m_dZoneMin[iZone], uCur ); } } // store current in-chunk positions m_dZoneStart[iZone] = pStart; m_dZoneEnd[iZone] = pEnd; // cached a bunch of spans, try our check again tKey.m_uDocid = pHit->m_uDocid; pZone = m_hZoneInfo ( tKey ); if ( pZone ) { // remove end markers that might mess up ordering Hitpos_t uPos = HITMAN::GetLCS ( pHit->m_uHitpos ); int iSpan = FindSpan ( pZone->m_dStarts, uPos ); if ( iSpan<0 || uPos>pZone->m_dEnds[iSpan] ) return SPH_ZONE_NO_SPAN; if ( pLastSpan ) *pLastSpan = iSpan; return SPH_ZONE_FOUND; } return SPH_ZONE_NO_DOCUMENT; } ////////////////////////////////////////////////////////////////////////// template < bool USE_BM25 > int ExtRanker_WeightSum_c::GetMatches () { if ( !m_pRoot ) return 0; const ExtDoc_t * pDoc = m_pDoclist; int iMatches = 0; while ( iMatchesm_uDocid==DOCID_MAX ) pDoc = GetFilteredDocs (); if ( !pDoc ) { m_pDoclist = NULL; return iMatches; } DWORD uRank = 0; DWORD uMask = pDoc->m_uDocFields; if ( !uMask ) { // possible if we have more than 32 fields // honestly loading all hits etc is cumbersome, so let's just fake it uRank = 1; } else { // just sum weights over the lowest 32 fields int iWeights = Min ( m_iWeights, 32 ); for ( int i=0; im_uDocFields & (1<m_uDocid==DOCID_MAX ) pDoc = GetFilteredDocs (); if ( !pDoc ) { m_pDoclist = NULL; return iMatches; } Swap ( m_dMatches[iMatches], m_dMyMatches[pDoc-m_dMyDocs] ); // OPTIMIZE? can avoid this swap and simply return m_dMyMatches (though in lesser chunks) m_dMatches[iMatches].m_iWeight = 1; iMatches++; pDoc++; } m_pDoclist = pDoc; return iMatches; } ////////////////////////////////////////////////////////////////////////// template < typename STATE > ExtRanker_T::ExtRanker_T ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup ) : ExtRanker_c ( tXQ, tSetup ) { if ( m_bZSlist && !m_pRoot->GetExtraData ( EXTRA_GET_DATA_ZONESPANS, (void**) & m_pZones )) m_bZSlist = false; if ( m_bZSlist ) { m_dZonespans.Reserve ( ExtNode_i::MAX_DOCS ); } m_dZonespans.Resize ( 1 ); m_pHitBase = NULL; } static inline const ExtHit_t * RankerGetHits ( CSphQueryProfile * pProfile, ExtNode_i * pRoot, const ExtDoc_t * pDocs, SphDocID_t uMaxID ) { if ( !pProfile ) return pRoot->GetHitsChunk ( pDocs, uMaxID ); pProfile->Switch ( SPH_QSTATE_GET_HITS ); const ExtHit_t * pHlist = pRoot->GetHitsChunk ( pDocs, uMaxID ); pProfile->Switch ( SPH_QSTATE_RANK ); return pHlist; } template < typename STATE > int ExtRanker_T::GetMatches () { if ( !m_pRoot ) return 0; CSphQueryProfile * pProfile = m_pCtx->m_pProfile; int iMatches = 0; const ExtHit_t * pHlist = m_pHitlist; const ExtHit_t * pHitBase = m_pHitBase; const ExtDoc_t * pDocs = m_pDoclist; m_dZonespans.Resize(1); int iLastZoneData = 0; bool bZSlist = m_bZSlist; CSphVector dSpans; if ( bZSlist ) { dSpans.Resize ( m_dZones.GetLength() ); ARRAY_FOREACH ( i, dSpans ) dSpans[i] = -1; } // warmup if necessary if ( !pHlist ) { if ( !pDocs ) pDocs = GetFilteredDocs (); if ( !pDocs ) return iMatches; pHlist = RankerGetHits ( pProfile, m_pRoot, pDocs, m_uMaxID ); if ( !pHlist ) return iMatches; } if ( !pHitBase ) pHitBase = pHlist; // main matching loop const ExtDoc_t * pDoc = pDocs; for ( SphDocID_t uCurDocid=0; iMatchesm_uDocid==uCurDocid ) { m_tState.Update ( pHlist ); if ( bZSlist ) { int * pZones = m_pZones->GetZVec ( pHlist-pHitBase ); ARRAY_FOREACH ( i, m_dZones ) if ( pZones[i]>=0 && dSpans[i]!=pZones[i] ) { m_dZonespans.Add ( i ); m_dZonespans.Add ( pZones[i] ); dSpans[i] = pZones[i]; } } ++pHlist; } // if hits block is over, get next block, but do *not* flush current doc if ( pHlist->m_uDocid==DOCID_MAX ) { assert ( pDocs ); pHlist = RankerGetHits ( pProfile, m_pRoot, pDocs, m_uMaxID ); if ( pHlist ) continue; } // otherwise (new match or no next hits block), flush current doc if ( uCurDocid ) { assert ( uCurDocid==pDoc->m_uDocid ); Swap ( m_dMatches[iMatches], m_dMyMatches[pDoc-m_dMyDocs] ); m_dMatches[iMatches].m_iWeight = m_tState.Finalize ( m_dMatches[iMatches] ); if ( bZSlist ) { m_dZonespans[iLastZoneData] = m_dZonespans.GetLength()-iLastZoneData-1; m_dMatches[iMatches].m_iTag = iLastZoneData; iLastZoneData = m_dZonespans.GetLength(); m_dZonespans.Add(0); ARRAY_FOREACH ( i, dSpans ) dSpans[i] = -1; } iMatches++; } // boundary checks if ( !pHlist ) { // there are no more hits for current docs block; do we have a next one? assert ( pDocs ); pDoc = pDocs = GetFilteredDocs (); // we don't, so bail out if ( !pDocs ) break; // we do, get some hits pHlist = m_pRoot->GetHitsChunk ( pDocs, m_uMaxID ); assert ( pHlist ); // fresh docs block, must have hits } // skip until next good doc/hit pair assert ( pDoc->m_uDocid<=pHlist->m_uDocid ); while ( pDoc->m_uDocidm_uDocid ) pDoc++; assert ( pDoc->m_uDocid==pHlist->m_uDocid ); uCurDocid = pHlist->m_uDocid; } m_pDoclist = pDocs; m_pHitlist = pHlist; if ( !m_pHitBase ) m_pHitBase = pHitBase; return iMatches; } ////////////////////////////////////////////////////////////////////////// #if USE_WINDOWS #pragma warning(disable:4127) // conditional expr is const for MSVC #endif template < bool USE_BM25, bool HANDLE_DUPES > struct RankerState_Proximity_fn : public ISphExtra { BYTE m_uLCS[SPH_MAX_FIELDS]; BYTE m_uCurLCS; int m_iExpDelta; int m_iFields; const int * m_pWeights; DWORD m_uLcsTailPos; DWORD m_uLcsTailQposMask; DWORD m_uCurQposMask; DWORD m_uCurPos; bool Init ( int iFields, const int * pWeights, ExtRanker_c *, CSphString & ) { memset ( m_uLCS, 0, sizeof(m_uLCS) ); m_uCurLCS = 0; m_iExpDelta = -INT_MAX; m_iFields = iFields; m_pWeights = pWeights; m_uLcsTailPos = 0; m_uLcsTailQposMask = 0; m_uCurQposMask = 0; m_uCurPos = 0; return true; } void Update ( const ExtHit_t * pHlist ) { if ( !HANDLE_DUPES ) { // all query keywords are unique // simpler path (just do the delta) int iDelta = HITMAN::GetLCS ( pHlist->m_uHitpos ) - pHlist->m_uQuerypos; if ( iDelta==m_iExpDelta ) m_uCurLCS = m_uCurLCS + BYTE(pHlist->m_uWeight); else m_uCurLCS = BYTE(pHlist->m_uWeight); DWORD uField = HITMAN::GetField ( pHlist->m_uHitpos ); if ( m_uCurLCS>m_uLCS[uField] ) m_uLCS[uField] = m_uCurLCS; m_iExpDelta = iDelta + pHlist->m_uSpanlen - 1; // !COMMIT why spanlen?? } else { // keywords are duplicated in the query // so there might be multiple qpos entries sharing the same hitpos DWORD uPos = HITMAN::GetLCS ( pHlist->m_uHitpos ); DWORD uField = HITMAN::GetField ( pHlist->m_uHitpos ); // reset accumulated data from previous field if ( (DWORD)HITMAN::GetField ( m_uCurPos )!=uField ) m_uCurQposMask = 0; if ( uPos!=m_uCurPos ) { // next new and shiny hitpos in line // FIXME!? what do we do with longer spans? keep looking? reset? if ( m_uCurLCS<2 ) { m_uLcsTailPos = m_uCurPos; m_uLcsTailQposMask = m_uCurQposMask; m_uCurLCS = 1; // FIXME!? can this ever be different? ("a b" c) maybe.. } m_uCurQposMask = 0; m_uCurPos = uPos; if ( m_uLCS[uField] < pHlist->m_uWeight ) m_uLCS[uField] = BYTE(pHlist->m_uWeight); } // add that qpos to current qpos mask (for the current hitpos) m_uCurQposMask |= ( 1UL << pHlist->m_uQuerypos ); // and check if that results in a better lcs match now int iDelta = m_uCurPos - m_uLcsTailPos; if ( ( m_uCurQposMask >> iDelta ) & m_uLcsTailQposMask ) { // cool, it matched! m_uLcsTailQposMask = ( 1UL << pHlist->m_uQuerypos ); // our lcs span now ends with a specific qpos m_uLcsTailPos = m_uCurPos; // and in a specific position m_uCurLCS = BYTE ( m_uCurLCS + pHlist->m_uWeight ); // and it's longer m_uCurQposMask = 0; // and we should avoid matching subsequent hits on the same hitpos // update per-field vector if ( m_uCurLCS>m_uLCS[uField] ) m_uLCS[uField] = m_uCurLCS; } } } DWORD Finalize ( const CSphMatch & tMatch ) { m_uCurLCS = 0; m_iExpDelta = -1; if ( HANDLE_DUPES ) { m_uLcsTailPos = 0; m_uLcsTailQposMask = 0; m_uCurQposMask = 0; m_uCurPos = 0; } DWORD uRank = 0; for ( int i=0; im_iMaxQpos; return true; } void Update ( const ExtHit_t * pHlist ) { // upd LCS DWORD uField = HITMAN::GetField ( pHlist->m_uHitpos ); int iDelta = HITMAN::GetLCS ( pHlist->m_uHitpos ) - pHlist->m_uQuerypos; if ( iDelta==m_iExpDelta && HITMAN::GetLCS ( pHlist->m_uHitpos )>=m_uMinExpPos ) { m_uCurLCS = m_uCurLCS + BYTE(pHlist->m_uWeight); if ( HITMAN::IsEnd ( pHlist->m_uHitpos ) && (int)pHlist->m_uQuerypos==m_iMaxQuerypos && HITMAN::GetPos ( pHlist->m_uHitpos )==m_iMaxQuerypos ) { m_uExactHit |= ( 1UL << HITMAN::GetField ( pHlist->m_uHitpos ) ); } } else { m_uCurLCS = BYTE(pHlist->m_uWeight); if ( HITMAN::GetPos ( pHlist->m_uHitpos )==1 ) { m_uHeadHit |= ( 1UL << HITMAN::GetField ( pHlist->m_uHitpos ) ); if ( HITMAN::IsEnd ( pHlist->m_uHitpos ) && m_iMaxQuerypos==1 ) m_uExactHit |= ( 1UL << HITMAN::GetField ( pHlist->m_uHitpos ) ); } } if ( m_uCurLCS>m_uLCS[uField] ) m_uLCS[uField] = m_uCurLCS; m_iExpDelta = iDelta + pHlist->m_uSpanlen - 1; m_uMinExpPos = HITMAN::GetLCS ( pHlist->m_uHitpos ) + 1; } DWORD Finalize ( const CSphMatch & tMatch ) { m_uCurLCS = 0; m_iExpDelta = -1; DWORD uRank = 0; for ( int i=0; i>i)&1) + ((m_uExactHit>>i)&1) )*m_pWeights[i]; m_uLCS[i] = 0; } m_uHeadHit = 0; m_uExactHit = 0; return tMatch.m_iWeight + uRank*SPH_BM25_SCALE; } }; template < bool USE_BM25 > struct RankerState_ProximityPayload_fn : public RankerState_Proximity_fn { DWORD m_uPayloadRank; DWORD m_uPayloadMask; bool Init ( int iFields, const int * pWeights, ExtRanker_c * pRanker, CSphString & sError ) { RankerState_Proximity_fn::Init ( iFields, pWeights, pRanker, sError ); m_uPayloadRank = 0; m_uPayloadMask = pRanker->m_uPayloadMask; return true; } void Update ( const ExtHit_t * pHlist ) { DWORD uField = HITMAN::GetField ( pHlist->m_uHitpos ); if ( ( 1<m_uPayloadRank += HITMAN::GetPos ( pHlist->m_uHitpos ) * this->m_pWeights[uField]; else RankerState_Proximity_fn::Update ( pHlist ); } DWORD Finalize ( const CSphMatch & tMatch ) { // as usual, redundant 'this' is just because gcc is stupid this->m_uCurLCS = 0; this->m_iExpDelta = -1; DWORD uRank = m_uPayloadRank; for ( int i=0; im_iFields; i++ ) { // no special care for payload fields as their LCS will be 0 anyway uRank += this->m_uLCS[i]*this->m_pWeights[i]; this->m_uLCS[i] = 0; } m_uPayloadRank = 0; return USE_BM25 ? tMatch.m_iWeight + uRank*SPH_BM25_SCALE : uRank; } }; ////////////////////////////////////////////////////////////////////////// struct RankerState_MatchAny_fn : public RankerState_Proximity_fn { int m_iPhraseK; BYTE m_uMatchMask[SPH_MAX_FIELDS]; bool Init ( int iFields, const int * pWeights, ExtRanker_c * pRanker, CSphString & sError ) { RankerState_Proximity_fn::Init ( iFields, pWeights, pRanker, sError ); m_iPhraseK = 0; for ( int i=0; im_iQwords; memset ( m_uMatchMask, 0, sizeof(m_uMatchMask) ); return true; } void Update ( const ExtHit_t * pHlist ) { RankerState_Proximity_fn::Update ( pHlist ); m_uMatchMask [ HITMAN::GetField ( pHlist->m_uHitpos ) ] |= ( 1<<(pHlist->m_uQuerypos-1) ); } DWORD Finalize ( const CSphMatch & ) { m_uCurLCS = 0; m_iExpDelta = -1; DWORD uRank = 0; for ( int i=0; im_uHitpos ) ]; } DWORD Finalize ( const CSphMatch & ) { DWORD uRes = m_uRank; m_uRank = 0; return uRes; } }; ////////////////////////////////////////////////////////////////////////// struct RankerState_Fieldmask_fn : public ISphExtra { DWORD m_uRank; bool Init ( int, const int *, ExtRanker_c *, CSphString & ) { m_uRank = 0; return true; } void Update ( const ExtHit_t * pHlist ) { m_uRank |= 1UL << HITMAN::GetField ( pHlist->m_uHitpos ); } DWORD Finalize ( const CSphMatch & ) { DWORD uRes = m_uRank; m_uRank = 0; return uRes; } }; ////////////////////////////////////////////////////////////////////////// class FactorPool_c { public: FactorPool_c (); void Prealloc ( int iElementSize, int nElements ); BYTE * Alloc (); void Free ( BYTE * pPtr ); int GetElementSize() const; int GetIntElementSize () const; void AddToHash ( SphDocID_t iId, BYTE * pPacked ); void AddRef ( SphDocID_t iId ); void Release ( SphDocID_t iId ); void Flush (); bool IsInitialized() const; SphFactorHash_t * GetHashPtr(); private: int m_iElementSize; int m_iNextFree; CSphTightVector m_dPool; CSphTightVector m_dFree; SphFactorHash_t m_dHash; SphFactorHashEntry_t * Find ( SphDocID_t iId ) const; inline DWORD HashFunc ( SphDocID_t iId ) const; bool FlushEntry ( SphFactorHashEntry_t * pEntry ); }; FactorPool_c::FactorPool_c () : m_iElementSize ( 0 ) , m_iNextFree ( 0 ) { } void FactorPool_c::Prealloc ( int iElementSize, int nElements ) { m_iElementSize = iElementSize; m_dPool.Resize ( nElements*GetIntElementSize() ); m_dHash.Resize ( nElements ); m_dFree.Reserve ( nElements ); memset ( m_dHash.Begin(), 0, sizeof(m_dHash[0])*m_dHash.GetLength() ); } BYTE * FactorPool_c::Alloc () { if ( m_dFree.GetLength() ) { int iIndex = m_dFree.Pop(); return m_dPool.Begin() + iIndex*GetIntElementSize(); } assert ( m_iNextFree<=m_dPool.GetLength() / GetIntElementSize() ); BYTE * pAllocated = m_dPool.Begin()+m_iNextFree*GetIntElementSize(); m_iNextFree++; return pAllocated; } void FactorPool_c::Free ( BYTE * pPtr ) { if ( !pPtr ) return; assert ( (pPtr-m_dPool.Begin() ) % GetIntElementSize()==0); assert ( pPtr>=m_dPool.Begin() && pPtr<&( m_dPool.Last() ) ); int iIndex = ( pPtr-m_dPool.Begin() )/GetIntElementSize(); m_dFree.Add(iIndex); } int FactorPool_c::GetIntElementSize () const { return m_iElementSize+sizeof(SphFactorHashEntry_t); } int FactorPool_c::GetElementSize() const { return m_iElementSize; } void FactorPool_c::AddToHash ( SphDocID_t iId, BYTE * pPacked ) { SphFactorHashEntry_t * pNew = (SphFactorHashEntry_t *)(pPacked+m_iElementSize); memset ( pNew, 0, sizeof(SphFactorHashEntry_t) ); DWORD uKey = HashFunc(iId); if ( m_dHash[uKey] ) { SphFactorHashEntry_t * pStart = m_dHash[uKey]; pNew->m_pPrev = NULL; pNew->m_pNext = pStart; pStart->m_pPrev = pNew; } pNew->m_pData = pPacked; pNew->m_iId = iId; m_dHash[uKey] = pNew; } SphFactorHashEntry_t * FactorPool_c::Find ( SphDocID_t iId ) const { DWORD uKey = HashFunc(iId); if ( m_dHash[uKey] ) { SphFactorHashEntry_t * pEntry = m_dHash[uKey]; while ( pEntry ) { if ( pEntry->m_iId==iId ) return pEntry; pEntry = pEntry->m_pNext; } } return NULL; } void FactorPool_c::AddRef ( SphDocID_t iId ) { if ( !iId ) return; SphFactorHashEntry_t * pEntry = Find ( iId ); if ( pEntry ) pEntry->m_iRefCount++; } void FactorPool_c::Release ( SphDocID_t iId ) { if ( !iId ) return; SphFactorHashEntry_t * pEntry = Find ( iId ); if ( pEntry ) { pEntry->m_iRefCount--; bool bHead = !pEntry->m_pPrev; SphFactorHashEntry_t * pNext = pEntry->m_pNext; if ( FlushEntry ( pEntry ) && bHead ) m_dHash[HashFunc(iId)] = pNext; } } bool FactorPool_c::FlushEntry ( SphFactorHashEntry_t * pEntry ) { assert ( pEntry->m_iRefCount>=0 ); if ( pEntry->m_iRefCount ) return false; assert ( pEntry ); if ( pEntry->m_pPrev ) pEntry->m_pPrev->m_pNext = pEntry->m_pNext; if ( pEntry->m_pNext ) pEntry->m_pNext->m_pPrev = pEntry->m_pPrev; Free ( pEntry->m_pData ); return true; }; void FactorPool_c::Flush() { ARRAY_FOREACH ( i, m_dHash ) { SphFactorHashEntry_t * pEntry = m_dHash[i]; while ( pEntry ) { SphFactorHashEntry_t * pNext = pEntry->m_pNext; bool bHead = !pEntry->m_pPrev; if ( FlushEntry(pEntry) && bHead ) m_dHash[i] = pNext; pEntry = pNext; } } } inline DWORD FactorPool_c::HashFunc ( SphDocID_t iId ) const { return iId % m_dHash.GetLength(); } bool FactorPool_c::IsInitialized() const { return !!m_iElementSize; } SphFactorHash_t * FactorPool_c::GetHashPtr () { return &m_dHash; } ////////////////////////////////////////////////////////////////////////// // EXPRESSION RANKER ////////////////////////////////////////////////////////////////////////// /// ranker state that computes weight dynamically based on user supplied expression (formula) template < bool NEED_PACKEDFACTORS = false, bool HANDLE_DUPES = false > struct RankerState_Expr_fn : public ISphExtra { public: // per-field and per-document stuff BYTE m_uLCS[SPH_MAX_FIELDS]; BYTE m_uMatchMask[SPH_MAX_FIELDS]; BYTE m_uCurLCS; DWORD m_uCurPos; DWORD m_uLcsTailPos; DWORD m_uLcsTailQposMask; DWORD m_uCurQposMask; int m_iExpDelta; int m_iFields; const int * m_pWeights; DWORD m_uDocBM25; CSphBitvec m_tMatchedFields; int m_iCurrentField; DWORD m_uHitCount[SPH_MAX_FIELDS]; DWORD m_uWordCount[SPH_MAX_FIELDS]; CSphVector m_dIDF; float m_dTFIDF[SPH_MAX_FIELDS]; float m_dMinIDF[SPH_MAX_FIELDS]; float m_dMaxIDF[SPH_MAX_FIELDS]; float m_dSumIDF[SPH_MAX_FIELDS]; int m_iMinHitPos[SPH_MAX_FIELDS]; int m_iMinBestSpanPos[SPH_MAX_FIELDS]; DWORD m_uExactHit; CSphBitvec m_tKeywords; DWORD m_uDocWordCount; int m_iMaxWindowHits[SPH_MAX_FIELDS]; CSphVector m_dTF; ///< for bm25a float m_fDocBM25A; ///< for bm25a CSphVector m_dFieldTF; ///< for bm25f, per-field layout (ie all field0 tfs, then all field1 tfs, etc) const char * m_sExpr; ISphExpr * m_pExpr; ESphAttr m_eExprType; const CSphSchema * m_pSchema; CSphAttrLocator m_tFieldLensLoc; float m_fAvgDocLen; const int64_t * m_pFieldLens; int64_t m_iTotalDocuments; float m_fParamK1; float m_fParamB; int m_iMaxQpos; ///< among all words, including dupes CSphVector m_dTermDupes; CSphVector m_dTermsHit; FactorPool_c m_tFactorPool; int m_iMaxMatches; // per-query stuff int m_iMaxLCS; int m_iQueryWordCount; public: // internal state, and factor settings CSphVector m_dWindow; int m_iWindowSize; public: RankerState_Expr_fn (); ~RankerState_Expr_fn (); bool Init ( int iFields, const int * pWeights, ExtRanker_c * pRanker, CSphString & sError ); void Update ( const ExtHit_t * pHlist ); DWORD Finalize ( const CSphMatch & tMatch ); bool IsTermSkipped ( int iTerm ); public: /// setup per-keyword data needed to compute the factors /// (namely IDFs, counts, masks etc) /// WARNING, CALLED EVEN BEFORE INIT()! void SetQwords ( const ExtQwordsHash_t & hQwords ) { m_dIDF.Resize ( m_iMaxQpos+1 ); // [MaxUniqQpos, MaxQpos] range will be all 0, but anyway m_dIDF.Fill ( 0.0f ); m_dTF.Resize ( m_iMaxQpos+1 ); m_dTF.Fill ( 0 ); m_iQueryWordCount = 0; m_tKeywords.Init ( m_iMaxQpos+1 ); // will not be tracking dupes bool bGotExpanded = false; hQwords.IterateStart(); while ( hQwords.IterateNext() ) { // tricky bit // for query_word_count, we only want to count keywords that are not (!) excluded by the query // that is, in (aa NOT bb) case, we want a value of 1, not 2 // there might be tail excluded terms these not affected MaxQpos if ( hQwords.IterateGet().m_bExcluded ) continue; const int iQueryPos = hQwords.IterateGet().m_iQueryPos; bool bQposUsed = m_tKeywords.BitGet ( iQueryPos ); bGotExpanded |= bQposUsed; m_iQueryWordCount += ( bQposUsed ? 0 : 1 ); // count only one term at that position m_tKeywords.BitSet ( iQueryPos ); // just to assert at early stage! m_dIDF [ iQueryPos ] += hQwords.IterateGet().m_fIDF; m_dTF [ iQueryPos ]++; } // FIXME!!! average IDF for expanded terms (aot morphology or dict=keywords) if ( bGotExpanded ) ARRAY_FOREACH ( i, m_dTF ) { if ( m_dTF[i]>1 ) m_dIDF[i] /= m_dTF[i]; } m_dTF.Fill ( 0 ); } /// finalize per-document factors that, well, need finalization void FinalizeDocFactors ( const CSphMatch & tMatch ) { m_uDocBM25 = tMatch.m_iWeight; for ( int i=0; i m_dMaxIDF[i] ) m_dMinIDF[i] = m_dMaxIDF[i] = 0; // must be FLT_MAX vs -FLT_MAX, aka no hits } m_uDocWordCount = sphBitCount ( m_uDocWordCount ); // compute real BM25 // with blackjack, and hookers, and field lengths, and parameters // // canonical idf = log ( (N-n+0.5) / (n+0.5) ) // sphinx idf = log ( (N-n+1) / n ) // and we also downscale our idf by 1/log(N+1) to map it into [-0.5, 0.5] range // compute document length float dl = 0; // OPTIMIZE? could precompute and store total dl in attrs, but at a storage cost CSphAttrLocator tLoc = m_tFieldLensLoc; if ( tLoc.m_iBitOffset>=0 ) for ( int i=0; i struct Expr_FieldFactor_c : public ISphExpr { const int * m_pIndex; const T * m_pData; Expr_FieldFactor_c ( const int * pIndex, const T * pData ) : m_pIndex ( pIndex ) , m_pData ( pData ) {} float Eval ( const CSphMatch & ) const { return (float) m_pData [ *m_pIndex ]; } int IntEval ( const CSphMatch & ) const { return (int) m_pData [ *m_pIndex ]; } }; /// bitmask field factor specialization template<> struct Expr_FieldFactor_c : public ISphExpr { const int * m_pIndex; const DWORD * m_pData; Expr_FieldFactor_c ( const int * pIndex, const DWORD * pData ) : m_pIndex ( pIndex ) , m_pData ( pData ) {} float Eval ( const CSphMatch & ) const { return (float)( (*m_pData) >> (*m_pIndex) ); } int IntEval ( const CSphMatch & ) const { return (int)( (*m_pData) >> (*m_pIndex) ); } }; /// generic per-document int factor struct Expr_IntPtr_c : public ISphExpr { DWORD * m_pVal; explicit Expr_IntPtr_c ( DWORD * pVal ) : m_pVal ( pVal ) {} float Eval ( const CSphMatch & ) const { return (float)*m_pVal; } int IntEval ( const CSphMatch & ) const { return (int)*m_pVal; } }; /// per-document field mask factor struct Expr_FieldMask_c : public ISphExpr { const CSphBitvec & m_tFieldMask; explicit Expr_FieldMask_c ( const CSphBitvec & tFieldMask ) : m_tFieldMask ( tFieldMask ) {} float Eval ( const CSphMatch & ) const { return (float)*m_tFieldMask.Begin(); } int IntEval ( const CSphMatch & ) const { return (int)*m_tFieldMask.Begin(); } }; /// generic per-document float factor struct Expr_FloatPtr_c : public ISphExpr { float * m_pVal; explicit Expr_FloatPtr_c ( float * pVal ) : m_pVal ( pVal ) {} float Eval ( const CSphMatch & ) const { return (float)*m_pVal; } int IntEval ( const CSphMatch & ) const { return (int)*m_pVal; } }; template < bool NEED_PACKEDFACTORS, bool HANDLE_DUPES > struct Expr_BM25F_T : public ISphExpr { RankerState_Expr_fn * m_pState; float m_fK1; float m_fB; float m_fWeightedAvgDocLen; CSphVector m_dWeights; ///< per field weights explicit Expr_BM25F_T ( RankerState_Expr_fn * pState, float k1, float b, ISphExpr * pFieldWeights ) { // bind k1, b m_pState = pState; m_fK1 = k1; m_fB = b; // bind weights m_dWeights.Resize ( pState->m_iFields ); m_dWeights.Fill ( 1 ); if ( pFieldWeights ) { Expr_ConstHash_c * pConstHash = dynamic_cast ( pFieldWeights ); assert ( pConstHash ); ARRAY_FOREACH ( i, pConstHash->m_dValues ) { // FIXME? report errors if field was not found? CSphString & sField = pConstHash->m_dValues[i].m_sName; int iField = pState->m_pSchema->GetFieldIndex ( sField.cstr() ); if ( iField>=0 ) m_dWeights[iField] = pConstHash->m_dValues[i].m_iValue; } } // compute weighted avgdl m_fWeightedAvgDocLen = 0; if ( m_pState->m_pFieldLens ) ARRAY_FOREACH ( i, m_dWeights ) m_fWeightedAvgDocLen += m_pState->m_pFieldLens[i] * m_dWeights[i]; else m_fWeightedAvgDocLen = 1.0f; m_fWeightedAvgDocLen /= m_pState->m_iTotalDocuments; } float Eval ( const CSphMatch & tMatch ) const { // compute document length // OPTIMIZE? could precompute and store total dl in attrs, but at a storage cost // OPTIMIZE? could at least share between multiple BM25F instances, if there are many float dl = 0; CSphAttrLocator tLoc = m_pState->m_tFieldLensLoc; if ( tLoc.m_iBitOffset>=0 ) for ( int i=0; im_iFields; i++ ) { dl += tMatch.GetAttr ( tLoc ) * m_dWeights[i]; tLoc.m_iBitOffset += 32; } // compute (the current instance of) BM25F float fRes = 0.0f; for ( int iWord=1; iWord<=m_pState->m_iMaxQpos; iWord++ ) { if ( m_pState->IsTermSkipped ( iWord ) ) continue; // compute weighted TF float tf = 0.0f; for ( int i=0; im_iFields; i++ ) tf += m_pState->m_dFieldTF [ iWord + i*(1+m_pState->m_iMaxQpos) ] * m_dWeights[i]; float idf = m_pState->m_dIDF[iWord]; // FIXME? zeroed out for dupes! fRes += tf / (tf + m_fK1*(1.0f - m_fB + m_fB*dl/m_fWeightedAvgDocLen)) * idf; } return fRes + 0.5f; // map to [0..1] range } }; /// function that sums sub-expressions over matched fields template < bool NEED_PACKEDFACTORS, bool HANDLE_DUPES > struct Expr_Sum_T : public ISphExpr { RankerState_Expr_fn * m_pState; ISphExpr * m_pArg; Expr_Sum_T ( RankerState_Expr_fn * pState, ISphExpr * pArg ) : m_pState ( pState ) , m_pArg ( pArg ) {} virtual ~Expr_Sum_T() { SafeRelease ( m_pArg ); } float Eval ( const CSphMatch & tMatch ) const { m_pState->m_iCurrentField = 0; float fRes = 0; const CSphBitvec & tFields = m_pState->m_tMatchedFields; int iBits = tFields.BitCount(); while ( iBits ) { if ( tFields.BitGet ( m_pState->m_iCurrentField ) ) { fRes += m_pArg->Eval ( tMatch ); iBits--; } m_pState->m_iCurrentField++; } return fRes; } int IntEval ( const CSphMatch & tMatch ) const { m_pState->m_iCurrentField = 0; int iRes = 0; const CSphBitvec & tFields = m_pState->m_tMatchedFields; int iBits = tFields.BitCount(); while ( iBits ) { if ( tFields.BitGet ( m_pState->m_iCurrentField ) ) { iRes += m_pArg->IntEval ( tMatch ); iBits--; } m_pState->m_iCurrentField++; } return iRes; } }; // FIXME! cut/pasted from sphinxexpr; remove dupe struct Expr_GetIntConst_c : public ISphExpr { int m_iValue; explicit Expr_GetIntConst_c ( int iValue ) : m_iValue ( iValue ) {} virtual float Eval ( const CSphMatch & ) const { return (float) m_iValue; } // no assert() here cause generic float Eval() needs to work even on int-evaluator tree virtual int IntEval ( const CSphMatch & ) const { return m_iValue; } virtual int64_t Int64Eval ( const CSphMatch & ) const { return m_iValue; } }; /// hook that exposes field-level factors, document-level factors, and matched field SUM() function to generic expressions template < bool NEED_PACKEDFACTORS, bool HANDLE_DUPES > class ExprRankerHook_T : public ISphExprHook { public: RankerState_Expr_fn * m_pState; const char * m_sCheckError; bool m_bCheckInFieldAggr; public: explicit ExprRankerHook_T ( RankerState_Expr_fn * pState ) : m_pState ( pState ) , m_sCheckError ( NULL ) , m_bCheckInFieldAggr ( false ) {} int IsKnownIdent ( const char * sIdent ) { // OPTIMIZE? hash this some nice long winter night? if ( !strcasecmp ( sIdent, "lcs" ) ) return XRANK_LCS; if ( !strcasecmp ( sIdent, "user_weight" ) ) return XRANK_USER_WEIGHT; if ( !strcasecmp ( sIdent, "hit_count" ) ) return XRANK_HIT_COUNT; if ( !strcasecmp ( sIdent, "word_count" ) ) return XRANK_WORD_COUNT; if ( !strcasecmp ( sIdent, "tf_idf" ) ) return XRANK_TF_IDF; if ( !strcasecmp ( sIdent, "min_idf" ) ) return XRANK_MIN_IDF; if ( !strcasecmp ( sIdent, "max_idf" ) ) return XRANK_MAX_IDF; if ( !strcasecmp ( sIdent, "sum_idf" ) ) return XRANK_SUM_IDF; if ( !strcasecmp ( sIdent, "min_hit_pos" ) ) return XRANK_MIN_HIT_POS; if ( !strcasecmp ( sIdent, "min_best_span_pos" ) ) return XRANK_MIN_BEST_SPAN_POS; if ( !strcasecmp ( sIdent, "exact_hit" ) ) return XRANK_EXACT_HIT; if ( !strcasecmp ( sIdent, "bm25" ) ) return XRANK_BM25; if ( !strcasecmp ( sIdent, "max_lcs" ) ) return XRANK_MAX_LCS; if ( !strcasecmp ( sIdent, "field_mask" ) ) return XRANK_FIELD_MASK; if ( !strcasecmp ( sIdent, "query_word_count" ) ) return XRANK_QUERY_WORD_COUNT; if ( !strcasecmp ( sIdent, "doc_word_count" ) ) return XRANK_DOC_WORD_COUNT; return -1; } int IsKnownFunc ( const char * sFunc ) { if ( !strcasecmp ( sFunc, "sum" ) ) return XRANK_SUM; if ( !strcasecmp ( sFunc, "max_window_hits" ) ) return XRANK_MAX_WINDOW_HITS; if ( !strcasecmp ( sFunc, "bm25a" ) ) return XRANK_BM25A; if ( !strcasecmp ( sFunc, "bm25f" ) ) return XRANK_BM25F; return -1; } ISphExpr * CreateNode ( int iID, ISphExpr * pLeft, ESphEvalStage * ) { int * pCF = &m_pState->m_iCurrentField; // just a shortcut switch ( iID ) { case XRANK_LCS: return new Expr_FieldFactor_c ( pCF, m_pState->m_uLCS ); case XRANK_USER_WEIGHT: return new Expr_FieldFactor_c ( pCF, m_pState->m_pWeights ); case XRANK_HIT_COUNT: return new Expr_FieldFactor_c ( pCF, m_pState->m_uHitCount ); case XRANK_WORD_COUNT: return new Expr_FieldFactor_c ( pCF, m_pState->m_uWordCount ); case XRANK_TF_IDF: return new Expr_FieldFactor_c ( pCF, m_pState->m_dTFIDF ); case XRANK_MIN_IDF: return new Expr_FieldFactor_c ( pCF, m_pState->m_dMinIDF ); case XRANK_MAX_IDF: return new Expr_FieldFactor_c ( pCF, m_pState->m_dMaxIDF ); case XRANK_SUM_IDF: return new Expr_FieldFactor_c ( pCF, m_pState->m_dSumIDF ); case XRANK_MIN_HIT_POS: return new Expr_FieldFactor_c ( pCF, m_pState->m_iMinHitPos ); case XRANK_MIN_BEST_SPAN_POS: return new Expr_FieldFactor_c ( pCF, m_pState->m_iMinBestSpanPos ); case XRANK_EXACT_HIT: return new Expr_FieldFactor_c ( pCF, &m_pState->m_uExactHit ); case XRANK_MAX_WINDOW_HITS: { CSphMatch tDummy; m_pState->m_iWindowSize = pLeft->IntEval ( tDummy ); // must be constant; checked in GetReturnType() SafeRelease ( pLeft ); return new Expr_FieldFactor_c ( pCF, m_pState->m_iMaxWindowHits ); } case XRANK_BM25: return new Expr_IntPtr_c ( &m_pState->m_uDocBM25 ); case XRANK_MAX_LCS: return new Expr_GetIntConst_c ( m_pState->m_iMaxLCS ); case XRANK_FIELD_MASK: return new Expr_FieldMask_c ( m_pState->m_tMatchedFields ); case XRANK_QUERY_WORD_COUNT: return new Expr_GetIntConst_c ( m_pState->m_iQueryWordCount ); case XRANK_DOC_WORD_COUNT: return new Expr_IntPtr_c ( &m_pState->m_uDocWordCount ); case XRANK_BM25A: { // exprs we'll evaluate here must be constant; that is checked in GetReturnType() // so having a dummy match with no data work alright assert ( pLeft->IsArglist() ); CSphMatch tDummy; m_pState->m_fParamK1 = pLeft->GetArg(0)->Eval ( tDummy ); m_pState->m_fParamB = pLeft->GetArg(1)->Eval ( tDummy ); m_pState->m_fParamK1 = Max ( m_pState->m_fParamK1, 0.001f ); m_pState->m_fParamB = Min ( Max ( m_pState->m_fParamB, 0.0f ), 1.0f ); SafeDelete ( pLeft ); return new Expr_FloatPtr_c ( &m_pState->m_fDocBM25A ); } case XRANK_BM25F: { assert ( pLeft->IsArglist() ); CSphMatch tDummy; float fK1 = pLeft->GetArg(0)->Eval ( tDummy ); float fB = pLeft->GetArg(1)->Eval ( tDummy ); fK1 = Max ( fK1, 0.001f ); fB = Min ( Max ( fB, 0.0f ), 1.0f ); ISphExpr * pRes = new Expr_BM25F_T ( m_pState, fK1, fB, pLeft->GetArg(2) ); SafeDelete ( pLeft ); return pRes; } case XRANK_SUM: return new Expr_Sum_T ( m_pState, pLeft ); default: return NULL; } } ESphAttr GetIdentType ( int iID ) { switch ( iID ) { case XRANK_LCS: // field-level case XRANK_USER_WEIGHT: case XRANK_HIT_COUNT: case XRANK_WORD_COUNT: case XRANK_MIN_HIT_POS: case XRANK_MIN_BEST_SPAN_POS: case XRANK_EXACT_HIT: case XRANK_MAX_WINDOW_HITS: case XRANK_BM25: // doc-level case XRANK_MAX_LCS: case XRANK_FIELD_MASK: case XRANK_QUERY_WORD_COUNT: case XRANK_DOC_WORD_COUNT: return SPH_ATTR_INTEGER; case XRANK_TF_IDF: case XRANK_MIN_IDF: case XRANK_MAX_IDF: case XRANK_SUM_IDF: return SPH_ATTR_FLOAT; default: assert ( 0 ); return SPH_ATTR_INTEGER; } } /// helper to check argument types by a signature string (passed in sArgs) /// every character in the signature describes a type /// ? = any type /// i = integer /// I = integer constant /// f = float /// s = scalar (int/float) /// h = hash /// signature can also be preceded by "c:" modifier than means that all arguments must be constant bool CheckArgtypes ( const CSphVector & dArgs, const char * sFuncname, const char * sArgs, bool bAllConst, CSphString & sError ) { if ( sArgs[0]=='c' && sArgs[1]==':' ) { if ( !bAllConst ) { sError.SetSprintf ( "%s() requires constant arguments", sFuncname ); return false; } sArgs += 2; } int iLen = strlen ( sArgs ); if ( dArgs.GetLength()!=iLen ) { sError.SetSprintf ( "%s() requires %d argument(s), not %d", sFuncname, iLen, dArgs.GetLength() ); return false; } ARRAY_FOREACH ( i, dArgs ) { switch ( *sArgs++ ) { case '?': break; case 'i': if ( dArgs[i]!=SPH_ATTR_INTEGER ) { sError.SetSprintf ( "argument %d to %s() must be integer", i, sFuncname ); return false; } break; case 's': if ( dArgs[i]!=SPH_ATTR_INTEGER && dArgs[i]!=SPH_ATTR_FLOAT ) { sError.SetSprintf ( "argument %d to %s() must be scalar (integer or float)", i, sFuncname ); return false; } break; case 'h': if ( dArgs[i]!=SPH_ATTR_CONSTHASH ) { sError.SetSprintf ( "argument %d to %s() must be a hash of constants", i, sFuncname ); return false; } break; default: assert ( 0 && "unknown signature code" ); break; } } // this is important! // other previous failed checks might have filled sError // and if anything else up the stack checks it, we need an empty message now sError = ""; return true; } ESphAttr GetReturnType ( int iID, const CSphVector & dArgs, bool bAllConst, CSphString & sError ) { switch ( iID ) { case XRANK_SUM: if ( !CheckArgtypes ( dArgs, "SUM", "?", bAllConst, sError ) ) return SPH_ATTR_NONE; return dArgs[0]; case XRANK_MAX_WINDOW_HITS: if ( !CheckArgtypes ( dArgs, "MAX_WINDOW_HITS", "c:i", bAllConst, sError ) ) return SPH_ATTR_NONE; return SPH_ATTR_INTEGER; case XRANK_BM25A: if ( !CheckArgtypes ( dArgs, "BM25A", "c:ss", bAllConst, sError ) ) return SPH_ATTR_NONE; return SPH_ATTR_FLOAT; case XRANK_BM25F: if ( !CheckArgtypes ( dArgs, "BM25F", "c:ss", bAllConst, sError ) ) if ( !CheckArgtypes ( dArgs, "BM25F", "c:ssh", bAllConst, sError ) ) return SPH_ATTR_NONE; return SPH_ATTR_FLOAT; default: sError.SetSprintf ( "internal error: unknown hook function (id=%d)", iID ); } return SPH_ATTR_NONE; } void CheckEnter ( int iID ) { if ( !m_sCheckError ) switch ( iID ) { case XRANK_LCS: case XRANK_USER_WEIGHT: case XRANK_HIT_COUNT: case XRANK_WORD_COUNT: case XRANK_TF_IDF: case XRANK_MIN_IDF: case XRANK_MAX_IDF: case XRANK_SUM_IDF: case XRANK_MIN_HIT_POS: case XRANK_MIN_BEST_SPAN_POS: case XRANK_EXACT_HIT: case XRANK_MAX_WINDOW_HITS: if ( !m_bCheckInFieldAggr ) m_sCheckError = "field factors must only occur withing field aggregates in ranking expression"; break; case XRANK_SUM: if ( m_bCheckInFieldAggr ) m_sCheckError = "field aggregates can not be nested in ranking expression"; else m_bCheckInFieldAggr = true; break; default: assert ( iID>=0 ); return; } } void CheckExit ( int iID ) { if ( !m_sCheckError && iID==XRANK_SUM ) { assert ( m_bCheckInFieldAggr ); m_bCheckInFieldAggr = false; } } }; /// ctor template < bool NEED_PACKEDFACTORS, bool HANDLE_DUPES > RankerState_Expr_fn ::RankerState_Expr_fn () : m_pWeights ( NULL ) , m_sExpr ( NULL ) , m_pExpr ( NULL ) , m_iMaxMatches ( 0 ) , m_iMaxLCS ( 0 ) , m_iQueryWordCount ( 0 ) {} /// dtor template < bool NEED_PACKEDFACTORS, bool HANDLE_DUPES > RankerState_Expr_fn ::~RankerState_Expr_fn () { SafeRelease ( m_pExpr ); } /// initialize ranker state template < bool NEED_PACKEDFACTORS, bool HANDLE_DUPES > bool RankerState_Expr_fn::Init ( int iFields, const int * pWeights, ExtRanker_c * pRanker, CSphString & sError ) { // cleanup factors memset ( m_uLCS, 0, sizeof(m_uLCS) ); memset ( m_uMatchMask, 0, sizeof(m_uMatchMask) ); m_uCurLCS = 0; if ( HANDLE_DUPES ) { m_uCurPos = 0; m_uLcsTailPos = 0; m_uLcsTailQposMask = 0; m_uCurQposMask = 0; } m_iExpDelta = -INT_MAX; m_iFields = iFields; m_pWeights = pWeights; m_uDocBM25 = 0; m_tMatchedFields.Init ( iFields ); m_iCurrentField = 0; memset ( m_uHitCount, 0, sizeof(m_uHitCount) ); memset ( m_uWordCount, 0, sizeof(m_uWordCount) ); memset ( m_dTFIDF, 0, sizeof(m_dTFIDF) ); memset ( m_dSumIDF, 0, sizeof(m_dSumIDF) ); memset ( m_iMinHitPos, 0, sizeof(m_iMinHitPos) ); memset ( m_iMinBestSpanPos, 0, sizeof(m_iMinBestSpanPos) ); memset ( m_iMaxWindowHits, 0, sizeof(m_iMaxWindowHits) ); m_iMaxQpos = pRanker->m_iMaxQpos; // already copied in SetQwords, but anyway m_uExactHit = 0; m_uDocWordCount = 0; m_iWindowSize = 1; for ( int i=0; i < SPH_MAX_FIELDS; i++ ) { m_dMinIDF[i] = FLT_MAX; m_dMaxIDF[i] = -FLT_MAX; } // compute query level constants // max_lcs, aka m_iMaxLCS (for matchany ranker emulation) gets computed here // query_word_count, aka m_iQueryWordCount is set elsewhere (in SetQwordsIDF()) m_iMaxLCS = 0; for ( int i=0; im_iQwords; for ( int i=0; iGetAttrsCount(); i++ ) { if ( m_pSchema->GetAttr(i).m_eAttrType!=SPH_ATTR_TOKENCOUNT ) continue; m_tFieldLensLoc = m_pSchema->GetAttr(i).m_tLocator; break; } m_fAvgDocLen = 0.0f; m_pFieldLens = pRanker->GetIndex()->GetFieldLens(); if ( m_pFieldLens ) for ( int i=0; iGetIndex()->GetStats().m_iTotalDocuments; m_fAvgDocLen /= m_iTotalDocuments; m_fParamK1 = 1.2f; m_fParamB = 0.75f; // not in SetQwords, because we only get iFields here m_dFieldTF.Resize ( m_iFields*(m_iMaxQpos+1) ); m_dFieldTF.Fill ( 0 ); // parse expression bool bUsesWeight; ExprRankerHook_T tHook ( this ); m_pExpr = sphExprParse ( m_sExpr, *m_pSchema, &m_eExprType, &bUsesWeight, sError, NULL, NULL, &tHook ); // FIXME!!! profile UDF here too if ( !m_pExpr ) return false; if ( m_eExprType!=SPH_ATTR_INTEGER && m_eExprType!=SPH_ATTR_FLOAT ) { sError = "ranking expression must evaluate to integer or float"; return false; } if ( bUsesWeight ) { sError = "ranking expression must not refer to WEIGHT()"; return false; } if ( tHook.m_sCheckError ) { sError = tHook.m_sCheckError; return false; } // all seems ok return true; } #if USE_WINDOWS #pragma warning(disable:4127) // conditional expr is const for MSVC #endif /// process next hit, update factors template < bool NEED_PACKEDFACTORS, bool HANDLE_DUPES > void RankerState_Expr_fn::Update ( const ExtHit_t * pHlist ) { const DWORD uField = HITMAN::GetField ( pHlist->m_uHitpos ); const int iPos = HITMAN::GetPos ( pHlist->m_uHitpos ); if ( !HANDLE_DUPES ) { // update LCS int iDelta = HITMAN::GetLCS ( pHlist->m_uHitpos )-pHlist->m_uQuerypos; if ( iDelta==m_iExpDelta ) { m_uCurLCS = m_uCurLCS+BYTE ( pHlist->m_uWeight ); if ( HITMAN::IsEnd ( pHlist->m_uHitpos ) && (int)pHlist->m_uQuerypos==m_iMaxQpos && iPos==m_iMaxQpos ) m_uExactHit |= ( 1UL << uField ); } else { m_uCurLCS = BYTE ( pHlist->m_uWeight ); if ( iPos==1 && HITMAN::IsEnd ( pHlist->m_uHitpos ) && m_iMaxQpos==1 ) m_uExactHit |= ( 1UL << uField ); } if ( m_uCurLCS>m_uLCS [ uField ] ) { m_uLCS [ uField ] = m_uCurLCS; // for first hit in field just use current position as min_best_span_pos // else adjust position with current lcs if ( !m_iMinBestSpanPos [ uField ] ) m_iMinBestSpanPos [ uField ] = iPos; else m_iMinBestSpanPos [ uField ] = iPos - m_uCurLCS + 1; } m_iExpDelta = iDelta + pHlist->m_uSpanlen - 1; } else { // reset accumulated data from previous field if ( (DWORD)HITMAN::GetField ( m_uCurPos )!=uField ) m_uCurQposMask = 0; DWORD uPos = HITMAN::GetLCS ( pHlist->m_uHitpos ); if ( (DWORD)uPos!=m_uCurPos ) { // next new and shiny hitpos in line // FIXME!? what do we do with longer spans? keep looking? reset? if ( m_uCurLCS<2 ) { m_uLcsTailPos = m_uCurPos; m_uLcsTailQposMask = m_uCurQposMask; m_uCurLCS = 1; } m_uCurQposMask = 0; m_uCurPos = uPos; if ( m_uLCS [ uField ]m_uWeight ) { m_uLCS [ uField ] = BYTE ( pHlist->m_uWeight ); m_iMinBestSpanPos [ uField ] = iPos; } } // add that qpos to current qpos mask (for the current hitpos) m_uCurQposMask |= ( 1UL << pHlist->m_uQuerypos ); // and check if that results in a better lcs match now int iDelta = ( m_uCurPos-m_uLcsTailPos ); if ( ( m_uCurQposMask >> iDelta ) & m_uLcsTailQposMask ) { // cool, it matched! m_uLcsTailQposMask = ( 1UL << pHlist->m_uQuerypos ); // our lcs span now ends with a specific qpos m_uLcsTailPos = m_uCurPos; // and in a specific position m_uCurLCS = BYTE ( m_uCurLCS+pHlist->m_uWeight ); // and it's longer m_uCurQposMask = 0; // and we should avoid matching subsequent hits on the same hitpos // update per-field vector if ( m_uCurLCS>m_uLCS [ uField ] ) { m_uLCS [ uField ] = m_uCurLCS; m_iMinBestSpanPos [ uField ] = iPos - m_uCurLCS + 1; } } if ( iDelta==m_iExpDelta ) { if ( HITMAN::IsEnd ( pHlist->m_uHitpos ) && (int)pHlist->m_uQuerypos==m_iMaxQpos && iPos==m_iMaxQpos ) m_uExactHit |= ( 1UL << uField ); } else { if ( iPos==1 && HITMAN::IsEnd ( pHlist->m_uHitpos ) && m_iMaxQpos==1 ) m_uExactHit |= ( 1UL << uField ); } m_iExpDelta = iDelta + pHlist->m_uSpanlen - 1; } // update other stuff m_uMatchMask[uField] |= ( 1<<(pHlist->m_uQuerypos-1) ); m_tMatchedFields.BitSet ( uField ); // keywords can be duplicated in the query, so we need this extra check WORD uQpos = pHlist->m_uQuerypos; bool bUniq = m_tKeywords.BitGet ( pHlist->m_uQuerypos ); if ( HANDLE_DUPES && bUniq ) { uQpos = m_dTermDupes [ uQpos ]; bUniq = ( m_dTermsHit[uQpos]!=pHlist->m_uHitpos ); m_dTermsHit[uQpos] = pHlist->m_uHitpos; } if ( bUniq ) { float fIDF = m_dIDF [ uQpos ]; DWORD uHitPosMask = 1< m_dMaxIDF[uField] ) m_dMaxIDF[uField] = fIDF; m_uHitCount[uField]++; m_uWordCount[uField] |= uHitPosMask; m_uDocWordCount |= uHitPosMask; m_dTFIDF[uField] += fIDF; // avoid duplicate check for BM25A, BM25F though // (that sort of automatically accounts for qtf factor) m_dTF [ uQpos ]++; m_dFieldTF [ uField*(1+m_iMaxQpos) + uQpos ]++; } if ( !m_iMinHitPos[uField] ) m_iMinHitPos[uField] = iPos; // update hit window, max_window_hits factor if ( m_dWindow.GetLength() ) { // sorted_remove_if ( _1 + winsize <= hitpos ) ) int i = 0; while ( im_uHitpos ) i++; for ( int j=0; jm_uHitpos ); m_iMaxWindowHits[uField] = Max ( m_iMaxWindowHits[uField], m_dWindow.GetLength() ); } #if USE_WINDOWS #pragma warning(default:4127) // conditional expr is const for MSVC #endif template < bool NEED_PACKEDFACTORS, bool HANDLE_DUPES > BYTE * RankerState_Expr_fn::PackFactors ( int * pSize ) { DWORD * pPackStart = NULL; if ( pSize ) { const DWORD MAX_PACKED_SIZE=2048; pPackStart = new DWORD [MAX_PACKED_SIZE]; } else { pPackStart = (DWORD *)m_tFactorPool.Alloc(); assert ( pPackStart ); } DWORD * pPack = pPackStart; // leave space for size pPack++; // document level factors *pPack++ = m_uDocBM25; *pPack++ = *(DWORD*)&m_fDocBM25A; *pPack++ = *m_tMatchedFields.Begin(); *pPack++ = m_uDocWordCount; // field level factors *pPack++ = (DWORD)m_iFields; for ( int i=0; i>i ) & 1; *pPack++ = (DWORD)m_iMaxWindowHits[i]; } } // word level factors *pPack++ = (DWORD)m_iMaxQpos; for ( int i=1; i<=m_iMaxQpos; i++ ) { DWORD uKeywordMask = !IsTermSkipped(i); *pPack++ = uKeywordMask; if ( uKeywordMask || pSize ) { *pPack++ = (DWORD)i; *pPack++ = (DWORD)m_dTF[i]; *pPack++ = *(DWORD*)&m_dIDF[i]; } } // m_dFieldTF = iWord + iField * ( 1 + iWordsCount ) // FIXME! pack these sparse factors ( however these should fit into fixed-size FactorPool block ) *pPack++ = m_dFieldTF.GetLength(); if ( !pSize ) memcpy ( pPack, m_dFieldTF.Begin(), m_dFieldTF.GetLength()*sizeof(m_dFieldTF[0]) ); pPack += m_dFieldTF.GetLength(); *pPackStart = (pPack-pPackStart)*sizeof(DWORD); if ( pSize ) { *pSize = (pPack-pPackStart)*sizeof(DWORD); delete [] pPackStart; return NULL; } assert ( (pPack-pPackStart)*sizeof(DWORD)<=(DWORD)m_tFactorPool.GetElementSize() ); return (BYTE*)pPackStart; } #if USE_WINDOWS #pragma warning(disable:4127) // conditional expr is const for MSVC #endif template bool RankerState_Expr_fn::ExtraDataImpl ( ExtraData_e eType, void ** ppResult ) { if ( eType==EXTRA_SET_MVAPOOL || eType==EXTRA_SET_STRINGPOOL || NEED_PACKEDFACTORS ) { switch ( eType ) { case EXTRA_SET_MVAPOOL: m_pExpr->Command ( SPH_EXPR_SET_MVA_POOL, (DWORD*)ppResult ); return true; case EXTRA_SET_STRINGPOOL: m_pExpr->Command ( SPH_EXPR_SET_STRING_POOL, (BYTE*)ppResult ); return true; case EXTRA_SET_MAXMATCHES: m_iMaxMatches = *(int*)ppResult; return true; case EXTRA_SET_MATCHPUSHED: m_tFactorPool.AddRef ( *(SphDocID_t*)ppResult ); return true; case EXTRA_SET_MATCHPOPPED: { const CSphTightVector & dReleased = *(CSphTightVector*)ppResult; ARRAY_FOREACH ( i, dReleased ) m_tFactorPool.Release ( dReleased[i] ); } return true; case EXTRA_GET_DATA_PACKEDFACTORS: *ppResult = m_tFactorPool.GetHashPtr(); return true; case EXTRA_GET_DATA_RANKER_STATE: { SphExtraDataRankerState_t * pState = (SphExtraDataRankerState_t *)ppResult; pState->m_iFields = m_iFields; pState->m_pSchema = m_pSchema; pState->m_pFieldLens = m_pFieldLens; pState->m_iTotalDocuments = m_iTotalDocuments; pState->m_tFieldLensLoc = m_tFieldLensLoc; pState->m_iMaxQpos = m_iMaxQpos; } return true; default: return false; } } else return false; } /// finish document processing, compute weight from factors template < bool NEED_PACKEDFACTORS, bool HANDLE_DUPES > DWORD RankerState_Expr_fn::Finalize ( const CSphMatch & tMatch ) { #ifndef NDEBUG // sanity check for ( int i=0; iIntEval ( tMatch ) : (DWORD)m_pExpr->Eval ( tMatch ); if ( HANDLE_DUPES ) { m_uCurPos = 0; m_uLcsTailPos = 0; m_uLcsTailQposMask = 0; m_uCurQposMask = 0; } // cleanup ResetDocFactors(); // done return uRes; } template bool RankerState_Expr_fn::IsTermSkipped ( int iTerm ) { assert ( iTerm>=0 && iTerm class ExtRanker_Expr_T : public ExtRanker_T< RankerState_Expr_fn > { public: ExtRanker_Expr_T ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup, const char * sExpr, const CSphSchema & tSchema ) : ExtRanker_T< RankerState_Expr_fn > ( tXQ, tSetup ) { // tricky bit, we stash the pointer to expr here, but it will be parsed // somewhat later during InitState() call, when IDFs etc are computed this->m_tState.m_sExpr = sExpr; this->m_tState.m_pSchema = &tSchema; } void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { ExtRanker_T< RankerState_Expr_fn >::SetQwordsIDF ( hQwords ); this->m_tState.m_iMaxQpos = this->m_iMaxQpos; this->m_tState.SetQwords ( hQwords ); } virtual int GetMatches () { if ( NEED_PACKEDFACTORS ) this->m_tState.FlushMatches (); return ExtRanker_T >::GetMatches (); } virtual void SetTermDupes ( const ExtQwordsHash_t & hQwords, int iMaxQpos ) { if ( !this->m_pRoot ) return; this->m_tState.m_dTermDupes.Resize ( iMaxQpos + 1 ); this->m_tState.m_dTermsHit.Resize ( iMaxQpos + 1 ); this->m_pRoot->GetTermDupes ( hQwords, this->m_tState.m_dTermDupes ); this->m_tState.m_dTermsHit.Fill ( EMPTY_HIT ); } }; #if USE_WINDOWS #pragma warning(default:4127) // conditional expr is const for MSVC #endif ////////////////////////////////////////////////////////////////////////// // EXPRESSION FACTORS EXPORT RANKER ////////////////////////////////////////////////////////////////////////// /// ranker state that computes BM25 as weight, but also all known factors for export purposes struct RankerState_Export_fn : public RankerState_Expr_fn<> { public: CSphOrderedHash < CSphString, SphDocID_t, IdentityHash_fn, 256 > m_hFactors; public: DWORD Finalize ( const CSphMatch & tMatch ) { // finalize factor computations FinalizeDocFactors ( tMatch ); // build document level factors // FIXME? should we build query level factors too? max_lcs, query_word_count, etc const int MAX_STR_LEN = 1024; CSphVector dVal; dVal.Resize ( MAX_STR_LEN ); snprintf ( dVal.Begin(), dVal.GetLength(), "bm25=%d, bm25a=%f, field_mask=%d, doc_word_count=%d", m_uDocBM25, m_fDocBM25A, *m_tMatchedFields.Begin(), m_uDocWordCount ); char sTmp[MAX_STR_LEN]; // build field level factors for ( int i=0; i>i ) & 1, m_iMaxWindowHits[i] ); int iValLen = strlen ( dVal.Begin() ); int iTotalLen = iValLen+strlen(sTmp); if ( dVal.GetLength() < iTotalLen+1 ) dVal.Resize ( iTotalLen+1 ); strcpy ( &(dVal[iValLen]), sTmp ); //NOLINT } // build word level factors for ( int i=1; i<=m_iMaxQpos; i++ ) if ( !IsTermSkipped ( i ) ) { snprintf ( sTmp, MAX_STR_LEN, ", word%d=(tf=%d, idf=%f)", i, m_dTF[i], m_dIDF[i] ); int iValLen = strlen ( dVal.Begin() ); int iTotalLen = iValLen+strlen(sTmp); if ( dVal.GetLength() < iTotalLen+1 ) dVal.Resize ( iTotalLen+1 ); strcpy ( &(dVal[iValLen]), sTmp ); //NOLINT } // export factors m_hFactors.Add ( dVal.Begin(), tMatch.m_iDocID ); // compute sorting expression now DWORD uRes = ( m_eExprType==SPH_ATTR_INTEGER ) ? m_pExpr->IntEval ( tMatch ) : (DWORD)m_pExpr->Eval ( tMatch ); // cleanup and return! ResetDocFactors(); return uRes; } virtual bool ExtraDataImpl ( ExtraData_e eType, void ** ppResult ) { if ( eType==EXTRA_GET_DATA_RANKFACTORS ) *ppResult = &m_hFactors; return true; } }; /// export ranker that emits BM25 as the weight, but computes and export all the factors /// useful for research purposes, eg. exporting the data for machine learning class ExtRanker_Export_c : public ExtRanker_T { public: ExtRanker_Export_c ( const XQQuery_t & tXQ, const ISphQwordSetup & tSetup, const char * sExpr, const CSphSchema & tSchema ) : ExtRanker_T ( tXQ, tSetup ) { m_tState.m_sExpr = sExpr; m_tState.m_pSchema = &tSchema; } void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { ExtRanker_T::SetQwordsIDF ( hQwords ); m_tState.m_iMaxQpos = m_iMaxQpos; m_tState.SetQwords ( hQwords ); } }; ////////////////////////////////////////////////////////////////////////// // RANKER FACTORY ////////////////////////////////////////////////////////////////////////// static void CheckQueryWord ( const char * szWord, CSphQueryResult * pResult, const CSphIndexSettings & tSettings, bool bStar ) { if ( ( !tSettings.m_iMinPrefixLen && !tSettings.m_iMinInfixLen ) || !bStar || !szWord ) return; int iLen = strlen ( szWord ); bool bHeadStar = szWord[0]=='*'; bool bTailStar = szWord[iLen-1]=='*'; int iLenWOStars = iLen - ( bHeadStar ? 1 : 0 ) - ( bTailStar ? 1 : 0 ); if ( bHeadStar || bTailStar ) { if ( tSettings.m_iMinInfixLen > 0 && iLenWOStars < tSettings.m_iMinInfixLen ) pResult->m_sWarning.SetSprintf ( "Query word length is less than min infix length. word: '%s' ", szWord ); else if ( tSettings.m_iMinPrefixLen > 0 && iLenWOStars < tSettings.m_iMinPrefixLen ) pResult->m_sWarning.SetSprintf ( "Query word length is less than min prefix length. word: '%s' ", szWord ); } } static void CheckExtendedQuery ( const XQNode_t * pNode, CSphQueryResult * pResult, const CSphIndexSettings & tSettings, bool bStar ) { ARRAY_FOREACH ( i, pNode->m_dWords ) CheckQueryWord ( pNode->m_dWords[i].m_sWord.cstr(), pResult, tSettings, bStar ); ARRAY_FOREACH ( i, pNode->m_dChildren ) CheckExtendedQuery ( pNode->m_dChildren[i], pResult, tSettings, bStar ); } struct ExtQwordOrderbyQueryPos_t { bool IsLess ( const ExtQword_t * pA, const ExtQword_t * pB ) const { return pA->m_iQueryPosm_iQueryPos; } }; static bool HasQwordDupes ( XQNode_t * pNode, SmallStringHash_T & hQwords ) { ARRAY_FOREACH ( i, pNode->m_dChildren ) if ( HasQwordDupes ( pNode->m_dChildren[i], hQwords ) ) return true; ARRAY_FOREACH ( i, pNode->m_dWords ) if ( !hQwords.Add ( 1, pNode->m_dWords[i].m_sWord ) ) return true; return false; } static bool HasQwordDupes ( XQNode_t * pNode ) { SmallStringHash_T hQwords; return HasQwordDupes ( pNode, hQwords ); } ISphRanker * sphCreateRanker ( const XQQuery_t & tXQ, const CSphQuery * pQuery, CSphQueryResult * pResult, const ISphQwordSetup & tTermSetup, const CSphQueryContext & tCtx ) { // shortcut const CSphIndex * pIndex = tTermSetup.m_pIndex; // check the keywords CheckExtendedQuery ( tXQ.m_pRoot, pResult, pIndex->GetSettings(), pIndex->IsStarEnabled() ); // fill payload mask DWORD uPayloadMask = 0; ARRAY_FOREACH ( i, pIndex->GetMatchSchema().m_dFields ) uPayloadMask |= pIndex->GetMatchSchema().m_dFields[i].m_bPayload << i; bool bGotDupes = HasQwordDupes ( tXQ.m_pRoot ); // setup eval-tree ExtRanker_c * pRanker = NULL; switch ( pQuery->m_eRanker ) { case SPH_RANK_PROXIMITY_BM25: if ( uPayloadMask ) pRanker = new ExtRanker_T < RankerState_ProximityPayload_fn > ( tXQ, tTermSetup ); else if ( tXQ.m_bSingleWord ) pRanker = new ExtRanker_WeightSum_c ( tXQ, tTermSetup ); else if ( bGotDupes ) pRanker = new ExtRanker_T < RankerState_Proximity_fn > ( tXQ, tTermSetup ); else pRanker = new ExtRanker_T < RankerState_Proximity_fn > ( tXQ, tTermSetup ); break; case SPH_RANK_BM25: pRanker = new ExtRanker_WeightSum_c ( tXQ, tTermSetup ); break; case SPH_RANK_NONE: pRanker = new ExtRanker_None_c ( tXQ, tTermSetup ); break; case SPH_RANK_WORDCOUNT: pRanker = new ExtRanker_T < RankerState_Wordcount_fn > ( tXQ, tTermSetup ); break; case SPH_RANK_PROXIMITY: if ( tXQ.m_bSingleWord ) pRanker = new ExtRanker_WeightSum_c<> ( tXQ, tTermSetup ); else if ( bGotDupes ) pRanker = new ExtRanker_T < RankerState_Proximity_fn > ( tXQ, tTermSetup ); else pRanker = new ExtRanker_T < RankerState_Proximity_fn > ( tXQ, tTermSetup ); break; case SPH_RANK_MATCHANY: pRanker = new ExtRanker_T < RankerState_MatchAny_fn > ( tXQ, tTermSetup ); break; case SPH_RANK_FIELDMASK: pRanker = new ExtRanker_T < RankerState_Fieldmask_fn > ( tXQ, tTermSetup ); break; case SPH_RANK_SPH04: pRanker = new ExtRanker_T < RankerState_ProximityBM25Exact_fn > ( tXQ, tTermSetup ); break; case SPH_RANK_EXPR: if ( tCtx.m_bPackedFactors && bGotDupes ) pRanker = new ExtRanker_Expr_T ( tXQ, tTermSetup, pQuery->m_sRankerExpr.cstr(), pIndex->GetMatchSchema() ); else if ( tCtx.m_bPackedFactors && !bGotDupes ) pRanker = new ExtRanker_Expr_T ( tXQ, tTermSetup, pQuery->m_sRankerExpr.cstr(), pIndex->GetMatchSchema() ); else if ( !tCtx.m_bPackedFactors && bGotDupes ) pRanker = new ExtRanker_Expr_T ( tXQ, tTermSetup, pQuery->m_sRankerExpr.cstr(), pIndex->GetMatchSchema() ); else if ( !tCtx.m_bPackedFactors && !bGotDupes ) pRanker = new ExtRanker_Expr_T ( tXQ, tTermSetup, pQuery->m_sRankerExpr.cstr(), pIndex->GetMatchSchema() ); break; case SPH_RANK_EXPORT: pRanker = new ExtRanker_Export_c ( tXQ, tTermSetup, pQuery->m_sRankerExpr.cstr(), pIndex->GetMatchSchema() ); break; default: pResult->m_sWarning.SetSprintf ( "unknown ranking mode %d; using default", (int)pQuery->m_eRanker ); if ( bGotDupes ) pRanker = new ExtRanker_T < RankerState_Proximity_fn > ( tXQ, tTermSetup ); else pRanker = new ExtRanker_T < RankerState_Proximity_fn > ( tXQ, tTermSetup ); break; } assert ( pRanker ); pRanker->m_uPayloadMask = uPayloadMask; // setup IDFs ExtQwordsHash_t hQwords; int iMaxQpos = pRanker->GetQwords ( hQwords ); const int iQwords = hQwords.GetLength (); const CSphSourceStats & tSourceStats = pIndex->GetStats(); CSphVector dWords; dWords.Reserve ( hQwords.GetLength() ); hQwords.IterateStart (); while ( hQwords.IterateNext() ) { ExtQword_t & tWord = hQwords.IterateGet (); // build IDF float fIDF = 0.0f; if ( pQuery->m_bGlobalIDF ) fIDF = pIndex->GetGlobalIDF ( tWord.m_sWord, tWord.m_iDocs, iQwords, pQuery->m_bPlainIDF ); else if ( tWord.m_iDocs ) { // (word_docs > total_docs) case *is* occasionally possible // because of dupes, or delayed purging in RT, etc // FIXME? we don't expect over 4G docs per just 1 local index const int64_t iTotalClamped = Max ( tSourceStats.m_iTotalDocuments, int64_t(tWord.m_iDocs) ); if ( !pQuery->m_bPlainIDF ) { // bm25 variant, idf = log((N-n+1)/n), as per Robertson et al // // idf \in [-log(N), log(N)] // weight \in [-NumWords*log(N), NumWords*log(N)] // we want weight \in [0, 1] range // we prescale idfs and get weight \in [-0.5, 0.5] range // then add 0.5 as our final step // // for the record, idf = log((N-n+0.5)/(n+0.5)) in the original paper // but our variant is a bit easier to compute, and has a better (symmetric) range float fLogTotal = logf ( float ( 1+iTotalClamped ) ); fIDF = logf ( float ( iTotalClamped-tWord.m_iDocs+1 ) / float ( tWord.m_iDocs ) ) / ( 2*iQwords*fLogTotal ); } else { // plain variant, idf=log(N/n), as per Sparck-Jones // // idf \in [0, log(N)] // weight \in [0, NumWords*log(N)] // we prescale idfs and get weight in [0, 0.5] range // then add 0.5 as our final step float fLogTotal = logf ( float ( 1+iTotalClamped ) ); fIDF = logf ( float ( iTotalClamped ) / float ( tWord.m_iDocs ) ) / ( 2*iQwords*fLogTotal ); } } tWord.m_fIDF = fIDF; dWords.Add ( &tWord ); } dWords.Sort ( ExtQwordOrderbyQueryPos_t() ); ARRAY_FOREACH ( i, dWords ) { const ExtQword_t * pWord = dWords[i]; pResult->AddStat ( pWord->m_sDictWord, pWord->m_iDocs, pWord->m_iHits, pWord->m_bExpanded ); } if ( bGotDupes ) pRanker->SetTermDupes ( hQwords, iMaxQpos ); pRanker->m_iMaxQpos = iMaxQpos; pRanker->SetQwordsIDF ( hQwords ); if ( !pRanker->InitState ( tCtx, pResult->m_sError ) ) SafeDelete ( pRanker ); return pRanker; } ////////////////////////////////////////////////////////////////////////// /// HIT MARKER ////////////////////////////////////////////////////////////////////////// void CSphHitMarker::Mark ( CSphVector & dMarked ) { if ( !m_pRoot ) return; const ExtHit_t * pHits = NULL; const ExtDoc_t * pDocs = NULL; SphDocID_t uMaxID = 0; pDocs = m_pRoot->GetDocsChunk ( &uMaxID ); if ( !pDocs ) return; for ( ;; ) { pHits = m_pRoot->GetHitsChunk ( pDocs, uMaxID ); if ( !pHits ) break; for ( ; pHits->m_uDocid!=DOCID_MAX; pHits++ ) { SphHitMark_t tMark; tMark.m_uPosition = HITMAN::GetPos ( pHits->m_uHitpos ); tMark.m_uSpan = pHits->m_uMatchlen; dMarked.Add ( tMark ); } } } CSphHitMarker::~CSphHitMarker () { SafeDelete ( m_pRoot ); } CSphHitMarker * CSphHitMarker::Create ( const XQNode_t * pRoot, const ISphQwordSetup & tSetup ) { ExtNode_i * pNode = NULL; if ( pRoot ) pNode = ExtNode_i::Create ( pRoot, tSetup ); if ( !pRoot || pNode ) { CSphHitMarker * pMarker = new CSphHitMarker; pMarker->m_pRoot = pNode; return pMarker; } return NULL; } ////////////////////////////////////////////////////////////////////////// // INTRA-BATCH CACHING ////////////////////////////////////////////////////////////////////////// /// container that does intra-batch query-sub-tree caching /// actually carries the cached data, NOT to be recreated frequently (see thin wrapper below) class NodeCacheContainer_t { private: friend class ExtNodeCached_t; friend class CSphQueryNodeCache; private: int m_iRefCount; bool m_StateOk; const ISphQwordSetup * m_pSetup; CSphVector m_Docs; CSphVector m_Hits; CSphVector m_InlineAttrs; int m_iAtomPos; // minimal position from original donor, used for shifting CSphQueryNodeCache * m_pNodeCache; public: NodeCacheContainer_t () : m_iRefCount ( 1 ) , m_StateOk ( true ) , m_pSetup ( NULL ) , m_iAtomPos ( 0 ) , m_pNodeCache ( NULL ) {} public: void Release() { if ( --m_iRefCount<=0 ) Invalidate(); } ExtNode_i * CreateCachedWrapper ( ExtNode_i* pChild, const XQNode_t * pRawChild, const ISphQwordSetup & tSetup ); private: bool WarmupCache ( ExtNode_i * pChild, int iQWords ); void Invalidate(); }; /// cached node wrapper to be injected into actual search trees /// (special container actually carries all the data and does the work, see blow) class ExtNodeCached_t : public ExtNode_i { friend class NodeCacheContainer_t; NodeCacheContainer_t * m_pNode; ExtDoc_t * m_pHitDoc; ///< points to entry in m_dDocs which GetHitsChunk() currently emits hits for SphDocID_t m_uHitsOverFor; ///< there are no more hits for matches block starting with this ID CSphString * m_pWarning; int64_t m_iMaxTimer; ///< work until this timestamp int m_iHitIndex; ///< store the current position in m_Hits for GetHitsChunk() int m_iDocIndex; ///< store the current position in m_Docs for GetDocsChunk() ExtNode_i * m_pChild; ///< pointer to donor for the sake of AtomPos procession int m_iQwords; ///< number of tokens in parent query void StepForwardToHitsFor ( SphDocID_t uDocId ); // creation possible ONLY via NodeCacheContainer_t explicit ExtNodeCached_t ( NodeCacheContainer_t * pNode, ExtNode_i * pChild ) : m_pNode ( pNode ) , m_pHitDoc ( NULL ) , m_uHitsOverFor ( 0 ) , m_pWarning ( NULL ) , m_iMaxTimer ( 0 ) , m_iHitIndex ( 0 ) , m_iDocIndex ( 0 ) , m_pChild ( pChild ) , m_iQwords ( 0 ) { m_iAtomPos = pChild->m_iAtomPos; } public: virtual ~ExtNodeCached_t () { SafeDelete ( m_pChild ); SafeRelease ( m_pNode ); } virtual void Reset ( const ISphQwordSetup & tSetup ) { if ( m_pChild ) m_pChild->Reset ( tSetup ); m_iHitIndex = 0; m_iDocIndex = 0; m_uHitsOverFor = 0; m_pHitDoc = NULL; m_iMaxTimer = 0; m_iMaxTimer = tSetup.m_iMaxTimer; m_pWarning = tSetup.m_pWarning; } virtual void HintDocid ( SphDocID_t ) {} virtual const ExtDoc_t * GetDocsChunk ( SphDocID_t * pMaxID ); virtual const ExtHit_t * GetHitsChunk ( const ExtDoc_t * pMatched, SphDocID_t uMaxID ); virtual int GetQwords ( ExtQwordsHash_t & hQwords ) { if ( m_pChild ) return m_pChild->GetQwords ( hQwords ); return -1; } virtual void SetQwordsIDF ( const ExtQwordsHash_t & hQwords ) { m_iQwords = hQwords.GetLength(); if ( m_pNode->m_pSetup ) { if ( m_pChild ) m_pChild->SetQwordsIDF ( hQwords ); m_pNode->WarmupCache ( m_pChild, m_iQwords ); } } virtual void GetTermDupes ( const ExtQwordsHash_t & hQwords, CSphVector & dTermDupes ) { if ( m_pChild ) m_pChild->GetTermDupes ( hQwords, dTermDupes ); } virtual bool GotHitless () { return ( m_pChild ) ? m_pChild->GotHitless() : false; } virtual uint64_t GetWordID() const { if ( m_pChild ) return m_pChild->GetWordID(); else return 0; } }; ////////////////////////////////////////////////////////////////////////// ExtNode_i * NodeCacheContainer_t::CreateCachedWrapper ( ExtNode_i * pChild, const XQNode_t * pRawChild, const ISphQwordSetup & tSetup ) { if ( !m_StateOk ) return pChild; // wow! we have a virgin! if ( !m_Docs.GetLength() ) { m_iRefCount = pRawChild->GetCount(); m_pSetup = &tSetup; } return new ExtNodeCached_t ( this, pChild ); } bool NodeCacheContainer_t::WarmupCache ( ExtNode_i * pChild, int iQwords ) { assert ( pChild ); assert ( m_pSetup ); SphDocID_t pMaxID = 0; m_iAtomPos = pChild->m_iAtomPos; const ExtDoc_t * pChunk = pChild->GetDocsChunk ( &pMaxID ); int iStride = 0; if ( pChunk && pChunk->m_pDocinfo ) iStride = pChild->m_iStride; while ( pChunk ) { const ExtDoc_t * pChunkHits = pChunk; bool iHasDocs = false; for ( ; pChunk->m_uDocid!=DOCID_MAX; pChunk++ ) { m_Docs.Add ( *pChunk ); // exclude number or Qwords from FIDF m_Docs.Last().m_fTFIDF *= iQwords; m_pNodeCache->m_iMaxCachedDocs--; if ( iStride>0 ) { // since vector will relocate the data on resize, do NOT fill new m_pDocinfo right now int iLen = m_InlineAttrs.GetLength(); m_InlineAttrs.Resize ( iLen+iStride ); memcpy ( &m_InlineAttrs[iLen], pChunk->m_pDocinfo, iStride*sizeof(CSphRowitem) ); } iHasDocs = true; } const ExtHit_t * pHits = NULL; if ( iHasDocs ) { SphDocID_t uLastDocid = m_Docs.Last().m_uDocid; while ( ( pHits = pChild->GetHitsChunk ( pChunkHits, uLastDocid ) )!=NULL ) { for ( ; pHits->m_uDocid!=DOCID_MAX; pHits++ ) { m_Hits.Add ( *pHits ); m_pNodeCache->m_iMaxCachedHits--; } } } // too many values, stop caching if ( m_pNodeCache->m_iMaxCachedDocs<0 || m_pNodeCache->m_iMaxCachedHits<0 ) { Invalidate (); pChild->Reset ( *m_pSetup ); m_pSetup = NULL; return false; } pChunk = pChild->GetDocsChunk ( &pMaxID ); } if ( iStride ) ARRAY_FOREACH ( i, m_Docs ) m_Docs[i].m_pDocinfo = &m_InlineAttrs[i*iStride]; m_Docs.Add().m_uDocid = DOCID_MAX; m_Hits.Add().m_uDocid = DOCID_MAX; pChild->Reset ( *m_pSetup ); m_pSetup = NULL; return true; } void NodeCacheContainer_t::Invalidate() { m_pNodeCache->m_iMaxCachedDocs += m_Docs.GetLength(); m_pNodeCache->m_iMaxCachedHits += m_Docs.GetLength(); m_Docs.Reset(); m_InlineAttrs.Reset(); m_Hits.Reset(); m_StateOk = false; } ////////////////////////////////////////////////////////////////////////// void ExtNodeCached_t::StepForwardToHitsFor ( SphDocID_t uDocId ) { assert ( m_pNode ); assert ( m_pNode->m_StateOk ); CSphVector & dHits = m_pNode->m_Hits; int iEnd = dHits.GetLength()-1; if ( m_iHitIndex>=iEnd ) return; if ( dHits[m_iHitIndex].m_uDocid==uDocId ) return; // binary search for lower (most left) bound of the subset of values int iHitIndex = m_iHitIndex; // http://blog.gamedeff.com/?p=12 while ( iEnd-iHitIndex>1 ) { if ( uDocIddHits[iEnd].m_uDocid ) { m_iHitIndex = -1; return; } int iMid = iHitIndex + (iEnd-iHitIndex)/2; if ( dHits[iMid].m_uDocid>=uDocId ) iEnd = iMid; else iHitIndex = iMid; } m_iHitIndex = iEnd; } const ExtDoc_t * ExtNodeCached_t::GetDocsChunk ( SphDocID_t * pMaxID ) { if ( !m_pNode || !m_pChild ) return NULL; if ( !m_pNode->m_StateOk ) return m_pChild->GetDocsChunk ( pMaxID ); if ( m_iMaxTimer>0 && sphMicroTimer()>=m_iMaxTimer ) { if ( m_pWarning ) *m_pWarning = "query time exceeded max_query_time"; return NULL; } m_uMaxID = 0; int iDoc = Min ( m_iDocIndex+MAX_DOCS-1, m_pNode->m_Docs.GetLength()-1 ) - m_iDocIndex; memcpy ( &m_dDocs[0], &m_pNode->m_Docs[m_iDocIndex], sizeof(ExtDoc_t)*iDoc ); m_iDocIndex += iDoc; // funny trick based on the formula of FIDF calculation. for ( int i=0; im_StateOk ) return m_pChild->GetHitsChunk ( pMatched, uMaxID ); if ( !pMatched ) return NULL; SphDocID_t uFirstMatch = pMatched->m_uDocid; // aim to the right document ExtDoc_t * pDoc = m_pHitDoc; m_pHitDoc = NULL; if ( !pDoc ) { // if we already emitted hits for this matches block, do not do that again if ( uFirstMatch==m_uHitsOverFor ) return NULL; // early reject whole block if ( pMatched->m_uDocid > m_uMaxID ) return NULL; if ( m_uMaxID && m_dDocs[0].m_uDocid > uMaxID ) return NULL; // find match pDoc = m_dDocs; do { while ( pDoc->m_uDocid < pMatched->m_uDocid ) pDoc++; if ( pDoc->m_uDocid==DOCID_MAX ) { m_uHitsOverFor = uFirstMatch; return NULL; // matched docs block is over for me, gimme another one } while ( pMatched->m_uDocid < pDoc->m_uDocid ) pMatched++; if ( pMatched->m_uDocid==DOCID_MAX ) { m_uHitsOverFor = uFirstMatch; return NULL; // matched doc block did not yet begin for me, gimme another one } } while ( pDoc->m_uDocid!=pMatched->m_uDocid ); // setup hitlist reader StepForwardToHitsFor ( pDoc->m_uDocid ); } // hit emission int iHit = 0; while ( iHitm_Hits[m_iHitIndex]; if ( tCachedHit.m_uDocid==DOCID_MAX ) break; if ( tCachedHit.m_uDocid!=pDoc->m_uDocid ) { // no more hits; get next acceptable document pDoc++; do { while ( pDoc->m_uDocid < pMatched->m_uDocid ) pDoc++; if ( pDoc->m_uDocid==DOCID_MAX ) { pDoc = NULL; break; } // matched docs block is over for me, gimme another one while ( pMatched->m_uDocid < pDoc->m_uDocid ) pMatched++; if ( pMatched->m_uDocid==DOCID_MAX ) { pDoc = NULL; break; } // matched doc block did not yet begin for me, gimme another one } while ( pDoc->m_uDocid!=pMatched->m_uDocid ); if ( !pDoc ) break; assert ( pDoc->m_uDocid==pMatched->m_uDocid ); // setup hitlist reader StepForwardToHitsFor ( pDoc->m_uDocid ); continue; } m_iHitIndex++; m_dHits[iHit] = tCachedHit; m_dHits[iHit].m_uQuerypos = (WORD)( m_dHits[iHit].m_uQuerypos + m_iAtomPos - m_pNode->m_iAtomPos ); iHit++; } m_pHitDoc = pDoc; if ( iHit==0 || iHit=0 && iHit0 && iMaxCachedHits>0 && iMaxCachedDocs>0 ) { m_pPool = new NodeCacheContainer_t [ iCells ]; for ( int i=0; iGetOrder()>=0 ); return m_pPool [ pRawChild->GetOrder() ].CreateCachedWrapper ( pChild, pRawChild, tSetup ); } // // $Id: sphinxsearch.cpp 4664 2014-04-18 18:08:22Z tomat $ //