using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
namespace ConsoleApplication1
{
/// <summary>
/// One dictionary entry of the inverted index: a word together with its
/// document frequency. The type does not override Equals/GetHashCode, so
/// instances used as dictionary keys are compared by reference.
/// </summary>
internal class Term
{
    /// <summary>The text of the term.</summary>
    public string word;

    /// <summary>Document frequency of the term.</summary>
    public int doc_freq;
}
/// <summary>
/// Builds an in-memory inverted index over the files found directly inside
/// one directory. Each file is assigned a document id (1, 2, 3, ... in the
/// order returned by <see cref="Directory.GetFiles(string)"/>), its text is
/// split into words, and every distinct word is mapped to the list of
/// document ids in which it occurs.
/// </summary>
class invertedIndex
{
    // Characters that terminate a word. Tab, CR and LF are included so that
    // words on adjacent lines are not fused into a single token.
    static readonly char[] separators = { ' ', ',', ';', '.', '!', '\t', '\r', '\n' };

    // The inverted index itself: term -> posting list (ids of the documents
    // that contain the term, in ascending order).
    readonly Dictionary<Term, List<int>> index = new Dictionary<Term, List<int>>();

    // word -> Term lookup. Term is compared by reference, so this map is what
    // lets us find the existing Term for a word without scanning every key.
    readonly Dictionary<string, Term> termsByWord = new Dictionary<string, Term>();

    // Document id -> path of the file that was indexed under that id.
    readonly Dictionary<int, string> mapping = new Dictionary<int, string>();

    /// <summary>
    /// Creates the index and immediately populates it from the files in
    /// <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Directory whose files are indexed.</param>
    /// <remarks>
    /// Exceptions thrown by <see cref="Directory.GetFiles(string)"/> or by
    /// reading a file propagate to the caller unchanged.
    /// </remarks>
    public invertedIndex(string path)
    {
        init(path);
    }

    /// <summary>
    /// Reads every file in <paramref name="path"/>, registers it under the
    /// next document id and adds its words to the index.
    /// </summary>
    /// <param name="path">Directory whose files are indexed.</param>
    void init(string path)
    {
        foreach (string file in Directory.GetFiles(path))
        {
            // Ids are 1-based and strictly increasing.
            int doc_id = mapping.Count + 1;
            mapping.Add(doc_id, file);

            string content;
            // Dispose the reader so the file handle is released right away.
            using (StreamReader sr = new StreamReader(file))
            {
                content = sr.ReadToEnd();
            }

            foreach (string word in content.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                Term term;
                if (!termsByWord.TryGetValue(word, out term))
                {
                    // First occurrence of this word anywhere: create the term
                    // with document frequency 1 and a posting list holding
                    // only the current document.
                    term = new Term();
                    term.word = word;
                    term.doc_freq = 1;
                    termsByWord.Add(word, term);
                    index.Add(term, new List<int> { doc_id });
                    continue;
                }

                // Known word. Documents are processed one at a time with
                // increasing ids, so the current document is already recorded
                // if and only if it is the last posting.
                List<int> posting = index[term];
                if (posting[posting.Count - 1] != doc_id)
                {
                    posting.Add(doc_id);
                    term.doc_freq++;
                }
            }
        }
    }

    /// <summary>
    /// Writes one line per term to the console in the form
    /// <c>word,doc_freq,id1 id2 ...</c>, where the ids are the documents
    /// containing the word.
    /// </summary>
    public void output()
    {
        foreach (KeyValuePair<Term, List<int>> entry in index)
        {
            // Print the actual contents; concatenating the Term and List
            // objects directly would only print their type names.
            Console.WriteLine(entry.Key.word + "," + entry.Key.doc_freq + "," + string.Join(" ", entry.Value));
        }
    }
}
}