% Conference paper by Timonen and Kasari, published in the IC3K 2012 revised
% selected papers volume (Springer CCIS series). The citation key is the
% exporter-generated identifier and is kept unchanged so existing citations
% continue to resolve.
% NOTE(review): the CCIS volume number is not recorded here; add a `volume`
% field once it has been confirmed against the publisher's record.
@inproceedings{4d9ef449512b4ea9a684781441a4482a,
  author    = {Timonen, Mika and Kasari, Melissa},
  title     = {Statistical Approach for Term Weighting in Very Short Documents for Text Categorization},
  editor    = {Fred, Ana and Dietz, Jan L. G. and Liu, Kecheng and Filipe, Joaquim},
  booktitle = {Knowledge Discovery, Knowledge Engineering and Knowledge Management},
  series    = {Communications in Computer and Information Science},
  publisher = {Springer},
  address   = {Berlin, Heidelberg},
  year      = {2013},
  pages     = {3--18},
  doi       = {10.1007/978-3-642-54105-6_1},
  isbn      = {978-3-642-54104-9},
  language  = {English},
  keywords  = {feature weighting, hapax legomenon, short document categorization, support vector machine, text categorization},
  abstract  = {In this paper, we propose a novel approach for term weighting in very short documents that is used with a Support Vector Machine classifier. We focus on market research and social media documents. In both of these data sources, the average length of a document is below twenty words. As the documents are short, each word occurs usually only once within a document. This is known as hapax legomenon and in our previous work as Term Frequency=1 challenge. For this reason, the traditional term weighting approaches become less effective with short documents. In this paper we propose a novel approach for term weighting that does not use term frequency within a document but substitutes it with other word statistics. In the experimental evaluation and comparison against several other term weighting approaches the proposed method produced promising results by out-performing the competition.},
  note      = {4th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management, IC3K 2012; Conference date: 04-10-2012 through 07-10-2012},
}