Most previous work on the recently developed language-modeling approach to information retrieval focuses on document-specific characteristics, and therefore does not take into account the structure of the surrounding corpus. We propose a novel algorithmic framework in which information provided by document-based language models is enhanced by the incorporation of information drawn from clusters of similar documents. Using this framework, we develop a suite of new algorithms. Even the simplest typically outperforms the standard language-modeling approach in precision and recall, and our new interpolation algorithm posts statistically significant improvements for both metrics over all three corpora tested.
% Conference paper by Kurland and Lee, 2004 (proceedings of SIGIR).
% Author names use the unambiguous "Last, First" form; the acronym in
% booktitle is brace-protected so that no style can lowercase it.
@inproceedings{Kurland+Lee:04a,
  author    = {Kurland, Oren and Lee, Lillian},
  title     = {Corpus structure, language models, and ad hoc information retrieval},
  booktitle = {Proceedings of {SIGIR}},
  year      = {2004},
  pages     = {194--201},
}

% Journal article by Kurland and Lee, 2009 (ACM Transactions on Information
% Systems, volume 27, number 3). The pages field uses the article-number
% form "13:1--13:39"; the doi is stored bare, without a resolver prefix.
@article{Kurland:2009:CLM:1508850.1508851,
  author  = {Kurland, Oren and Lee, Lillian},
  title   = {Clusters, Language Models, and Ad Hoc Information Retrieval},
  journal = {ACM Transactions on Information Systems},
  volume  = {27},
  number  = {3},
  year    = {2009},
  pages   = {13:1--13:39},
  doi     = {10.1145/1508850.1508851},
}