现在已经对一些文档求出了倒排索引,对于一些词得出了这些词在哪些文档中出现的列表。
要求对于倒排索引实现一些简单的查询,即查询某些词同时出现,或者有些词出现有些词不出现的文档有哪些。
样例输入(data.txt 的内容):3 3 1 2 3 1 2 1 3 3 1 1 1 1 -1 0 1 -1 -1
样例输出(每个查询对应一项):NOT FOUND 1 3 1
#include <climits>
#include <cstdlib>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
using namespace std;
// A cursor into the index table: it names one word and one position inside
// that word's row of document ids.
struct Node
{
    // Row of the index table this cursor walks (i.e. which word).
    int word_id;
    // Current column inside that row. Column 0 of a row holds the number of
    // entries, so the entries themselves start at column 1.
    int bit;
};
// qsort-compatible comparator for two ints.
//
// a, b   : pointers to the two int elements being compared.
// returns: a negative value when *a < *b, zero when they are equal and a
//          positive value when *a > *b.
int compare(const void * a,const void * b)
{
    const int lhs = *static_cast<const int *>(a);
    const int rhs = *static_cast<const int *>(b);
    // Compare rather than subtract: lhs - rhs overflows (undefined behaviour,
    // typically a wrong sign) when the operands are far apart, e.g.
    // INT_MAX and a negative number.
    return (lhs > rhs) - (lhs < rhs);
}
// Prints the values in mylist on one line, or "NOT FOUND" when the list is empty.
void print_list(list<int> mylist);
// Builds one cursor (starting at position 1) for every word whose flag in row
// `row` of `search` is 1 (need == true) or -1 (need == false).
list<Node> initialize_list(int **search,int row,int N,bool need);
// Returns the entries of wanted_list that are not matched by an entry of
// not_wanted_list; both lists are walked front to back in a single merge pass.
list<int> sub_data(list<int> wanted_list,list<int> not_wanted_list);
// Walks the rows of the required words in lock-step, looking for entries that
// all of them have in common.
list<int> find_same_need(list<Node> mylist,int ** data,const int length);
// Collects, sorted and without duplicates, every document id listed for any of
// the excluded words.
list<int> find_same_not_need(list<Node> mylist,int ** data,const int length);
// Loads the index table (`data`, N rows) and the query matrix (`search`,
// M rows of N flags) from the file "data.txt".
void read_data(int ** &data,int ** & search,int &M,int &N);
// Entry point: loads the index and the queries, answers every query on its own
// output line, then releases the tables.
int main()
{
    // read_data fills these through reference parameters. Starting from
    // null/zero keeps the query loop and the cleanup below well-defined even
    // when the input file cannot be read and the values are left untouched.
    int ** data = 0;
    int ** search = 0;
    int M = 0;
    int N = 0;
    read_data(data,search,M,N);

    for( int i=0;i<M;i++ )
    {
        // Cursors for the words the query requires (flag 1) and excludes (flag -1).
        list<Node> wanted_node_list = initialize_list(search,i,N,true);
        list<Node> not_wanted_node_list = initialize_list(search,i,N,false);

        list<int> wanted_list = find_same_need(wanted_node_list,data,N);
        list<int> not_wanted_list = find_same_not_need(not_wanted_node_list,data,N);

        // Answer = documents matching every required word minus the excluded ones.
        list<int> result = sub_data(wanted_list,not_wanted_list);
        print_list(result);
    }

    // The tables were allocated with new[] by read_data; release them row by row.
    if( data != 0 )
    {
        for( int i=0;i<N;i++ )
            delete [] data[i];
        delete [] data;
    }
    if( search != 0 )
    {
        for( int i=0;i<M;i++ )
            delete [] search[i];
        delete [] search;
    }

    // Keeps the console window open on Windows; on other systems the shell
    // has no "pause" command and this call has no useful effect.
    system("pause");
    return 0;
}
list<int> find_same_need(list<Node> mylist,int ** data,const int length)
{
list<int> result;
bool same ;
list<Node>::iterator min_node;
int min_article_id;
while( true )
{
// 1. 检查当前文档号是否是一样的
same = true;
Node one = mylist.front();
for(list<Node>::iterator node = mylist.begin();node != mylist.end(); node++)
{
if( data[one.word_id][one.bit] != data[node->word_id][node->bit] )
{
same = false;
break;
}
}
// 2. 结果如果不相同时,则需要增加最小的
if( ! same )
{
min_article_id = INT_MAX;
for(list<Node>::iterator node = mylist.begin();node != mylist.end(); node++)
{
if( data[node->word_id][node->bit] < min_article_id )
{
min_article_id = data[node->word_id][node->bit];
min_node = node;
}
}
min_node->bit ++;
// 已经得到所有结果
if( min_node->bit > data[min_node->word_id][0] )
return result;
}
// 3. 存储一个结果
else
{
result.push_back( mylist.front().bit );
for (list<Node>::iterator node = mylist.begin(); node != mylist.end(); node++)
{
node->bit ++;
// 已经得到所有结果
if( node->bit > data[node->word_id][0] )
return result;
}
}
}
}
list<int> find_same_not_need(list<Node> mylist,int ** data,const int length)
{
map<int,bool> myamp;
list<int> result;
for(list<Node>::iterator it = mylist.begin();it != mylist.end() ;it++)
{
while(it->bit <= data[it->word_id][0])
{
if( myamp.find(data[it->word_id][it->bit]) == myamp.end() )
{
myamp.insert(make_pair(data[it->word_id][it->bit],true));
}
it->bit ++;
}
}
for(map<int,bool>::iterator it = myamp.begin();it!=myamp.end();it++)
{
result.push_back(it->first);
}
return result;
}
// Removes from wanted_list the entries matched by not_wanted_list.
//
// Both lists are walked once, front to back, as in a merge; the result is the
// expected set difference when both are in ascending order.
//
// wanted_list     : candidate values.
// not_wanted_list : values to drop; each entry cancels at most one candidate.
// returns         : the surviving candidates, in their original order.
list<int> sub_data(list<int> wanted_list,list<int> not_wanted_list)
{
    list<int> kept;
    list<int>::const_iterator excluded = not_wanted_list.begin();

    for (list<int>::const_iterator candidate = wanted_list.begin();
         candidate != wanted_list.end(); ++candidate)
    {
        // Skip exclusions that are smaller than the current candidate.
        while (excluded != not_wanted_list.end() && *excluded < *candidate)
        {
            ++excluded;
        }

        // An equal exclusion cancels the candidate and is consumed with it.
        if (excluded != not_wanted_list.end() && *excluded == *candidate)
        {
            ++excluded;
            continue;
        }

        kept.push_back(*candidate);
    }
    return kept;
}
list<Node> initialize_list(int **search,int row,int N,bool need)
{
list<Node> result;
Node node;
if( need )
{
for( int i=0;i<N;i++ )
{
if( search[row][i] == 1 )
{
node.word_id = i;
node.bit = 1;
result.push_back( node );
}
}
}
else
{
for( int i=0;i<N;i++ )
{
if( search[row][i] == -1 )
{
node.word_id = i;
node.bit = 1;
result.push_back( node );
}
}
}
return result;
}
// Writes one result line to standard output: every value followed by a single
// space, or the text "NOT FOUND" when mylist is empty. The line is ended with
// endl in both cases.
void print_list(list<int> mylist)
{
    if (mylist.empty())
    {
        cout << "NOT FOUND" << endl;
        return;
    }

    for (list<int>::const_iterator value = mylist.begin(); value != mylist.end(); ++value)
    {
        cout << *value << " ";
    }
    cout << endl;
}
// Loads the index table and the queries from the file "data.txt".
//
// File layout (whitespace-separated integers):
//   N                      number of words
//   N times:  m  id_1 .. id_m   document ids in which the word occurs
//   M                      number of queries
//   M times:  N flags      1 = word required, -1 = word excluded, 0 = ignored
// Sample file contents: 3 3 1 2 3 1 2 1 3 3 1 1 1 1 -1 0 1 -1 -1
//
// data   : out; data[w][0] is the number of ids stored for word w and
//          data[w][1 .. data[w][0]] are those ids in ascending order.
// search : out; search[q][w] is the flag of word w in query q.
// M, N   : out; number of queries / number of words.
//
// The arrays are allocated with new[] and are owned by the caller. If the file
// cannot be opened, a message is written to cerr, data and search are set to
// null and M and N to 0. Missing or unreadable numbers are treated as absent:
// a row only counts the ids actually read, and missing flags read as 0.
void read_data(int ** &data,int ** & search,int &M,int &N)
{
    data = 0;
    search = 0;
    M = 0;
    N = 0;

    ifstream reader("data.txt");
    if( !reader )
    {
        cerr<<"read_data: cannot open data.txt"<<endl;
        return;
    }

    int word_count = 0;
    if( !(reader>>word_count) || word_count < 0 )
        word_count = 0;
    N = word_count;

    data = new int* [N];
    for(int i=0;i<N;i++)
    {
        // A failed or negative count would otherwise size the row wrongly and
        // make the write to column 0 land outside the allocation.
        int m = 0;
        if( !(reader>>m) || m < 0 )
            m = 0;

        data[i] = new int[m+1];
        // Count only the ids that were really read, so a truncated file never
        // leaves uninitialised entries inside the row.
        int stored = 0;
        while( stored < m && (reader>>data[i][stored+1]) )
            stored++;
        data[i][0] = stored;

        // The query code walks the rows in ascending order.
        sort(data[i]+1,data[i]+1+stored);
    }

    int query_count = 0;
    if( !(reader>>query_count) || query_count < 0 )
        query_count = 0;
    M = query_count;

    search = new int * [M];
    for(int i=0;i<M;i++)
    {
        search[i] = new int[N];
        for(int j =0;j<N;j++)
        {
            // A flag that cannot be read means "word ignored".
            if( !(reader>>search[i][j]) )
                search[i][j] = 0;
        }
    }
    // reader is closed by its destructor.
}
原文地址:http://blog.csdn.net/cqs_experiment/article/details/40374987