library.bib

Automatically generated by Mendeley Desktop 1.16.1
Any changes to this file will be lost if it is regenerated by Mendeley.

BibTeX export options can be customized via Options -> BibTeX in Mendeley Desktop

@article{Gupta2013,
abstract = {In the statistics community, outlier detection for time series data has been studied for decades. Recently, with advances in hardware and software technology, there has been a large body of work on temporal outlier detection from a computational perspective within the computer science community. In particular, advances in hardware technology have enabled the availability of various forms of temporal data collection mechanisms, and advances in software technology have enabled a variety of data management mechanisms. This has fueled the growth of different kinds of data sets such as data streams, spatio- temporal data, distributed streams, temporal networks, and time series data, generated by a multitude of applications. There arises a need for an organized and detailed study of the work done in the area of outlier detection with respect to such temporal datasets. In this survey, we provide a comprehensive and structured overview of a large set of interesting outlier definitions for various forms of temporal data, novel techniques, and application scenarios in which specific definitions and techniques have been widely used.},
author = {Gupta, Manish and Gao, Jing and Aggarwal, Charu C. and Han, Jiawei},
doi = {10.1109/TKDE.2013.184},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Outlier Detection for Temporal Data A Survey.pdf:pdf},
isbn = {9781627053754},
issn = {1041-4347},
journal = {Ieee Transactions on Knowledge and Data Engineering},
keywords = {Computational modeling,Data mining,Distributed databases,Hidden Markov models,Mining methods and algorithms,Pattern matching,Predictive models,Temporal outlier detection,Time series analysis,applications of temporal outlier detection,computer science community,data handling,data management mechanisms,data streams,distributed data streams,distributed streams,hardware technology,network outliers,software technology,spatio-temporal data,spatio-temporal outliers,statistics community,temporal data collection mechanisms,temporal datasets,temporal networks,temporal outlier detection,time series,time series data},
number = {1},
pages = {1--20},
title = {{Outlier Detection for Temporal Data : A Survey}},
url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=6684530$\backslash$nhttp://www.morganclaypool.com/doi/abs/10.2200/S00573ED1V01Y201403DMK008 http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=6684530},
volume = {25},
year = {2013}
}
@article{Aggarwal2013,
author = {Aggarwal, CC},
doi = {10.1007/978-},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/High-dimensional Outlier Detection Survey.pdf:pdf},
journal = {Outlier Analysis},
title = {{High-Dimensional Outlier Detection: The Subspace Method}},
url = {http://link.springer.com/chapter/10.1007/978-1-4614-6396-2{\_}5},
year = {2013}
}
@article{Bengio2012,
abstract = {After a more than decade-long period of relatively little research activity in the area of recurrent neural networks, several new developments will be reviewed here that have allowed substantial progress both in understanding and in technical solutions towards more efficient training of recurrent networks. These advances have been motivated by and related to the optimization issues surrounding deep learning. Although recurrent networks are extremely powerful in what they can in principle represent in terms of modelling sequences,their training is plagued by two aspects of the same issue regarding the learning of long-term dependencies. Experiments reported here evaluate the use of clipping gradients, spanning longer time ranges with leaky integration, advanced momentum techniques, using more powerful output probability models, and encouraging sparser gradients to help symmetry breaking and credit assignment. The experiments are performed on text and music data and show off the combined effects of these techniques in generally improving both training and test error.},
archivePrefix = {arXiv},
arxivId = {1212.0901},
author = {Bengio, Yoshua and Boulanger-Lewandowski, Nicolas and Pascanu, Razvan},
eprint = {1212.0901},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/ADVANCES IN OPTIMIZING RECURRENT NETWORKS.pdf:pdf},
title = {{Advances in Optimizing Recurrent Networks}},
url = {http://arxiv.org/abs/1212.0901},
year = {2012}
}
@article{mahoney2005trajectory,
author = {Mahoney, Matthew V and Chan, Philip K},
title = {{Trajectory boundary modeling of time series for anomaly detection}},
year = {2005}
}
@article{Ngkvist2014a,
abstract = {PATTERN RECOGNITION LETTERS, 42 (2014) 11-24. doi:10.1016/j.patrec.2014.01.008},
author = {Ngkvist, Martin L {\~{A}} and Karlsson, Lars and Loutfi, Amy},
doi = {10.1016/j.patrec.2014.01.008},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/A review of unsupervised feature learning and deep learning for time-series modeling.pdf:pdf},
issn = {01678655},
journal = {Pattern Recognition Letters},
number = {C},
pages = {11--24},
title = {{A review of unsupervised feature learning and deep learning for time-series modeling}},
url = {http://dx.doi.org/10.1016/j.patrec.2014.01.008$\backslash$npapers3://publication/doi/10.1016/j.patrec.2014.01.008},
volume = {42},
year = {2014}
}
@article{Otey2006,
abstract = {Efficiently detecting outliers or anomalies is an important problem in many areas of science, medicine and information technology. Applications range from data cleaning to clinical diagnosis, from detecting anomalous defects in materials to fraud and intrusion detection. Over the past decade, researchers in data mining and statistics have addressed the problem of outlier detection using both parametric and non-parametric approaches in a centralized setting. However, there are still several challenges that must be addressed. First, most approaches to date have focused on detecting outliers in a continuous attribute space. However, almost all real-world data sets contain a mixture of categorical and continuous attributes. Categorical attributes are typically ignored or incorrectly modeled by existing approaches, resulting in a significant loss of information. Second, there have not been any general-purpose distributed outlier detection algorithms. Most distributed detection algorithms are designed with a specific domain (e.g. sensor networks) in mind. Third, the data sets being analyzed may be streaming or otherwise dynamic in nature. Such data sets are prone to concept drift, and models of the data must be dynamic as well. To address these challenges, we present a tunable algorithm for distributed outlier detection in dynamic mixed-attribute data sets.},
author = {Otey, Matthew Eric and Ghoting, Amol and Parthasarathy, Srinivasan},
doi = {10.1007/s10618-005-0014-6},
issn = {1384-5810},
journal = {Data Mining and Knowledge Discovery},
number = {2-3},
pages = {203--228},
title = {{Fast Distributed Outlier Detection in Mixed-Attribute Data Sets}},
url = {http://www.springerlink.com/index/10.1007/s10618-005-0014-6},
volume = {12},
year = {2006}
}
@article{Chan2005,
abstract = {Our goal is to generate comprehensible and accurate models from multiple time series for anomaly detection. The models need to produce anomaly scores in an online man- ner for real-life monitoring tasks. We introduce three algo- rithms that work in a constructed feature space and evaluate them with a real data set from the NASA shuttle program. Our offline and online evaluations indicate that our algo- rithms can be more accurate than two existing algorithms. 1.},
author = {Chan, Philip K. and Mahoney, Matthew V.},
doi = {10.1109/ICDM.2005.101},
isbn = {0-7695-2278-5},
issn = {15504786},
journal = {IEEE International Conference on Data Mining},
pages = {90--97},
title = {{Modeling multiple time series for anomaly detection}},
url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=1565666$\backslash$nhttp://ieeexplore.ieee.org/xpls/abs{\_}all.jsp?arnumber=1565666},
year = {2005}
}
@article{Ye2000,
abstract = {This paper presents an anomaly detection technique to detect intrusions$\backslash$ninto computer and network systems. In this technique, a Markov chain$\backslash$nmodel is used to represent a temporal profile of normal behavior$\backslash$nin a computer and network system. The Markov chain model of the norm$\backslash$nprofile is learned from historic data of the system's normal behavior.$\backslash$nThe observed behavior of the system is analyzed to infer the probability$\backslash$nthat the Markov chain model of the norm profile supports the observed$\backslash$nbehavior. A low probability of support indicates an anomalous behavior$\backslash$nthat may result from intrusive activities. The technique was implemented$\backslash$nand tested on the audit data of a Sun Solaris system. The testing$\backslash$nresults showed that the technique clearly distinguished intrusive$\backslash$nactivities from normal activities in the testing data.},
author = {Ye, N},
journal = {Proceedings of the 2000 IEEE Systems, Man, and Cybernetics Information Assurance and Security Workshop},
keywords = {anomaly,anomaly detection,anomaly{\_}detection,detection,intrusion detection,iros,markov chain,temporal behaviour},
number = {4},
pages = {171--174},
title = {{A markov chain model of temporal behavior for anomaly detection}},
year = {2000}
}
@article{Vinyals2015,
abstract = {Deep recurrent architecture에 기반하여 image의 natural sentences 를 describing 하는 방법 소개},
archivePrefix = {arXiv},
arxivId = {arXiv:1411.4555v2},
author = {Vinyals, Oriol and Toshev, Alexander},
eprint = {arXiv:1411.4555v2},
isbn = {9781467369640},
journal = {Cvpr},
title = {{Show and Tell : A Neural Image Caption Generator}},
year = {2015}
}
@article{Sutskever2013a,
abstract = {Deep and recurrent neural networks (DNNs and RNNs respectively) are powerful models that were considered to be almost impossible to train using stochastic gradient descent with momentum. In this paper, we show that when stochastic gradient descent with momentum uses a well-designed random initialization and a particular type of slowly increasing schedule for the momentum parameter, it can train both DNNs and RNNs (on datasets with long-term dependencies) to levels of performance that were previously achievable only with Hessian-Free optimization. We find that both the initialization and the momentum are crucial since poorly initialized networks cannot be trained with momentum and well-initialized networks perform markedly worse when the momentum is absent or poorly tuned. Our success training these models suggests that previous attempts to train deep and recurrent neural networks from random initializations have likely failed due to poor initialization schemes. Furthermore, carefully tuned momentum methods suffice for dealing with the curvature issues in deep and recurrent network training objectives without the need for sophisticated second-order methods.},
author = {Sutskever, Ilya and Martens, James and Dahl, George and Hinton, Geoffrey},
doi = {10.1109/ICASSP.2013.6639346},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/On the importance of initialization and momentum in deep learning.pdf:pdf;:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/On the importance of initialization and momentum in deep learning.pdf:pdf},
isbn = {978-1-4799-0356-6},
issn = {15206149},
journal = {Jmlr W{\&}Cp},
number = {2010},
pages = {1139--1147},
title = {{On the importance of initialization and momentum in deep learning}},
volume = {28},
year = {2013}
}
@article{Wei2006,
abstract = {Over the past three decades, there has been a great deal of research on shape analysis, focusing mostly on shape indexing, clustering, and classification. In this work, we introduce the new problem of finding shape discords, the most unusual shapes in a collection. We motivate the problem by considering the utility of shape discords in diverse domains including zoology, anthropology, and medicine. While the brute force search algorithm has quadratic time complexity, we avoid this by using locality-sensitive hashing to estimate similarity between shapes which enables us to reorder the search more efficiently. An extensive experimental evaluation demonstrates that our approach can speed up computation by three to four orders of magnitude.},
author = {Wei, Li and Keogh, Eamonn and Xi, Aopeng},
doi = {10.1109/ICDM.2006.138},
isbn = {0769527019},
issn = {15504786},
journal = {Proceedings - IEEE International Conference on Data Mining, ICDM},
keywords = {anomaly detection,shape},
pages = {711--720},
title = {{SAXually explicit images: Finding unusual shapes}},
year = {2006}
}
@article{Erfani2014,
author = {Erfani, Sarah M. and Law, Yee Wei and Karunasekera, Shanika and Leckie, Christopher a. and Palaniswami, Marimuthu},
doi = {10.1007/978-3-319-06608-0_48},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Privacy-Preserving Collaborative Anomaly Detection for Participatory Sensing.pdf:pdf;:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Privacy-Preserving Collaborative Anomaly Detection for Participatory Sensing.pdf:pdf},
issn = {16113349},
journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
keywords = {Anomaly detection,Collaborative learning,Horizontally partitioned data,Participatory sensing,Privacy-preserving data mining},
number = {PART 1},
pages = {581--593},
title = {{Privacy-preserving collaborative anomaly detection for participatory sensing}},
volume = {8443 LNAI},
year = {2014}
}
@article{Laptev2015,
author = {Laptev, Nikolay and Flint, Ian},
doi = {10.1145/2783258.2788611},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Generic and Scalable Framework for Automated Time-Series Anomaly Detection.pdf:pdf},
isbn = {9781450336642},
journal = {Kdd},
title = {{Generic and Scalable Framework for Automated Time-series Anomaly Detection}},
year = {2015}
}
@article{Bengio2012a,
abstract = {After a more than decade-long period of relatively little research activity in the area of recurrent neural networks, several new developments will be reviewed here that have allowed substantial progress both in understanding and in technical solutions towards more efficient training of recurrent networks. These advances have been motivated by and related to the optimization issues surrounding deep learning. Although recurrent networks are extremely powerful in what they can in principle represent in terms of modelling sequences,their training is plagued by two aspects of the same issue regarding the learning of long-term dependencies. Experiments reported here evaluate the use of clipping gradients, spanning longer time ranges with leaky integration, advanced momentum techniques, using more powerful output probability models, and encouraging sparser gradients to help symmetry breaking and credit assignment. The experiments are performed on text and music data and show off the combined effects of these techniques in generally improving both training and test error.},
archivePrefix = {arXiv},
arxivId = {1212.0901},
author = {Bengio, Yoshua and Boulanger-Lewandowski, Nicolas and Pascanu, Razvan},
eprint = {1212.0901},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/ADVANCES IN OPTIMIZING RECURRENT NETWORKS.pdf:pdf},
title = {{Advances in Optimizing Recurrent Networks}},
url = {http://arxiv.org/abs/1212.0901},
year = {2012}
}
@inproceedings{Ge2010,
abstract = {The increasing availability of large-scale location traces creates unprecedent opportunities to change the paradigm for identifying abnormal moving activities. Indeed, various aspects of abnormality of moving patterns have recently been exploited, such as wrong direction and wandering. However, there is no recognized way of combining different aspects into an unified evolving abnormality score which has the ability to capture the evolving nature of abnormal moving trajectories. To that end, in this paper, we provide an evolving trajectory outlier detection method, named TOP-EYE, which continuously computes the outlying score for each trajectory in an accumulating way. Specifically, in TOP-EYE, we introduce a decay function to mitigate the influence of the past trajectories on the evolving outlying score, which is defined based on the evolving moving direction and density of trajectories. This decay function enables the evolving computation of accumulated outlying scores along the trajectories. An advantage of TOP-EYE is to identify evolving outliers at very early stage with relatively low false alarm rate. Finally, experimental results on real-world location traces show that TOP-EYE can effectively capture evolving abnormal trajectories.},
author = {Ge, Yong and Xiong, Hui and Zhou, Zhi-hua and Ozdemir, Hasan and Yu, Jannite and Lee, K C},
booktitle = {Proceedings of the 19th ACM international conference on Information and knowledge management},
doi = {10.1145/1871437.1871716},
isbn = {9781450300995},
keywords = {outlier},
pages = {1733--1736},
title = {{TOP-EYE : Top- k Evolving Trajectory Outlier Detection}},
year = {2010}
}
@article{Ester1996,
abstract = {Data clustering has become an important task for discovering significant patterns and characteristics in large spatial databases. The Multi-Centroid, Multi-Run Sampling Scheme (MCMRS) has been shown to be effective in improving the k-medoids-based clustering algorithms in our previous work. In this paper, a more advanced sampling scheme termed the Incremental (IMCMRS) is proposed for k-medoids-based clustering algorithms. Experimental results demonstrate the proposed scheme can not only reduce...},
archivePrefix = {arXiv},
arxivId = {10.1.1.71.1980},
author = {Ester, Martin and Kriegel, Hans P and Sander, Jorg and Xu, Xiaowei},
doi = {10.1.1.71.1980},
eprint = {10.1.1.71.1980},
isbn = {1577350049},
issn = {09758887},
journal = {Second International Conference on Knowledge Discovery and Data Mining},
keywords = {arbitrary shape of clus-,clustering algorithms,databases,efficiency on large spatial,handling noise,ters},
pages = {226--231},
title = {{A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise}},
url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.20.2930},
year = {1996}
}
@article{Bayer2014a,
abstract = {Leveraging advances in variational inference, we propose to enhance recurrent neural networks with latent variables, resulting in Stochastic Recurrent Networks (STORNs). The model i) can be trained with stochastic gradient methods, ii) allows structured and multi-modal conditionals at each time step, iii) features a reliable estimator of the marginal likelihood and iv) is a generalisation of deterministic recurrent neural networks. We evaluate the method on four polyphonic musical data sets and motion capture data.},
archivePrefix = {arXiv},
arxivId = {1411.7610},
author = {Bayer, Justin and Osendorfer, Christian},
eprint = {1411.7610},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/LEARNING STOCHASTIC RECURRENT NETWORKS.pdf:pdf},
pages = {1--9},
title = {{Learning Stochastic Recurrent Networks}},
url = {http://arxiv.org/abs/1411.7610},
year = {2014}
}
@article{Venugopalan2014,
abstract = {Solving the visual symbol grounding problem has long been a goal of artificial intelligence. The field appears to be advancing closer to this goal with recent breakthroughs in deep learning for natural language grounding in static images. In this paper, we propose to translate videos directly to sentences using a unified deep neural network with both convolutional and recurrent structure. Described video datasets are scarce, and most existing methods have been applied to toy domains with a small vocabulary of possible words. By transferring knowledge from 1.2M+ images with category labels and 100,000+ images with captions, our method is able to create sentence descriptions of open-domain videos with large vocabularies. We compare our approach with recent work using language generation metrics, subject, verb, and object prediction accuracy, and a human evaluation.},
archivePrefix = {arXiv},
arxivId = {1412.4729},
author = {Venugopalan, Subhashini and Xu, Huijuan and Donahue, Jeff and Rohrbach, Marcus and Mooney, Raymond and Saenko, Kate},
eprint = {1412.4729},
journal = {arXiv2014},
title = {{Translating Videos to Natural Language Using Deep Recurrent Neural Networks}},
url = {http://arxiv.org/abs/1412.4729},
year = {2014}
}
@inproceedings{Zhang2003,
abstract = {The state transition, which is hidden in the hidden Markov model (HMM), can be used to characterize the intrinsic difference between normal action and intrusion behavior. So HMM is an efficient way to detect anomalies. A new anomaly detection method based on a hierarchical HMM is proposed based on the concept of normal database and abnormal database. It is shown by analysis and simulation results that the proposed method is effective to increase the accuracy of anomaly detection.},
author = {Zhang, Xiaoqiang and Fan, Pingzhi and Zhu, Zhongliang},
booktitle = {Proceedings of the 8th International Scientific and Practical Conference of Students, Post-graduates and Young Scientists. Modern Technique and Technologies. MTT'2002 (Cat. No.02EX550)},
doi = {10.1109/PDCAT.2003.1236299},
isbn = {0-7803-7840-7},
keywords = {Analytical models,Data mining,Databases,Hidden Markov models,IDS,Intrusion detection,Neural networks,Pattern recognition,Power system modeling,Support vector machines,Viterbi algorithm,abnormal database,alarm systems,anomaly detection method,authorisation,database management systems,hidden Markov model,hidden Markov models,hierarchical HMM,intrusion behavior,intrusion detection system,normal database,safety systems,state transition},
pages = {249--252},
title = {{A new anomaly detection method based on hierarchical HMM}},
url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=1236299},
year = {2003}
}
@inproceedings{Jozefowicz2015,
abstract = {The Recurrent Neural Network (RNN) is an extremely powerful sequence model that is often difficult to train. The Long Short-Term Memory (LSTM) is a specific RNN architecture whose design makes it much easier to train. While wildly successful in practice, the LSTM's architecture appears to be ad-hoc so it is not clear if it is optimal, and the significance of its individual components is unclear. In this work, we aim to determine whether the LSTM architecture is optimal or whether much better architectures exist. We conducted a thorough architecture search where we evaluated over ten thousand different RNN architectures, and identified an architecture that outperforms both the LSTM and the recently-introduced Gated Recurrent Unit (GRU) on some but not all tasks. We found that adding a bias of 1 to the LSTM's forget gate closes the gap between the LSTM and the GRU.},
author = {Jozefowicz, Rafal and Zaremba, Wojciech and Sutskever, Ilya},
booktitle = {Proceedings of The 32nd International Conference on Machine Learning},
pages = {2342--2350},
title = {{An Empirical Exploration of Recurrent Network Architectures}},
url = {http://jmlr.org/proceedings/papers/v37/jozefowicz15.html},
year = {2015}
}
@article{Schubert2014a,
abstract = {Outlier detection research has been seeing many new algorithms every year that often appear to be only slightly different from existing methods along with some experiments that show them to "clearly outperform" the others. However, few approaches come along with a clear analysis of existing methods and a solid theoretical differentiation. Here, we provide a formalized method of analysis to allow for a theoretical comparison and generalization of many existing methods. Our unified view improves understanding of the shared properties and of the differences of outlier detection models. By abstracting the notion of locality from the classic distance-based notion, our framework facilitates the construction of abstract methods for many special data types that are usually handled with specialized algorithms. In particular, spatial neighborhood can be seen as a special case of locality. Here we therefore compare and generalize approaches to spatial outlier detection in a detailed manner. We also discuss temporal data like video streams, or graph data such as community networks. Since we reproduce results of specialized approaches with our general framework, and even improve upon them, our framework provides reasonable baselines to evaluate the true merits of specialized approaches. At the same time, seeing spatial outlier detection as a special case of local outlier detection, opens up new potentials for analysis and advancement of methods. {\textcopyright} 2012 The Author(s).},
author = {Schubert, Erich and Zimek, Arthur and Kriegel, Hans Peter},
doi = {10.1007/s10618-012-0300-z},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Local outlier detection reconsidered a generalized view on locality with applications to spatial, video, and network outlier detection.pdf:pdf},
isbn = {1932-1872},
issn = {13845810},
journal = {Data Mining and Knowledge Discovery},
keywords = {Local outlier,Network outlier,Spatial outlier,Video outlier},
number = {1},
pages = {190--237},
title = {{Local outlier detection reconsidered: A generalized view on locality with applications to spatial, video, and network outlier detection}},
volume = {28},
year = {2014}
}
@article{Hochreiter1997,
abstract = {Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient based method called long short-term memory (LSTM). Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O. 1. Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms.},
author = {Hochreiter, Sepp and Schmidhuber, J{\"{u}}rgen},
doi = {10.1162/neco.1997.9.8.1735},
isbn = {08997667 (ISSN)},
issn = {0899-7667},
journal = {Neural Computation},
number = {8},
pages = {1735--1780},
pmid = {9377276},
title = {{Long Short-Term Memory}},
volume = {9},
year = {1997}
}
@inproceedings{Malhotra2015,
abstract = {Long Short Term Memory (LSTM) networks have been demonstrated to be particularly useful for learning sequences containing longer term patterns of unknown length, due to their ability to maintain long term memory. Stacking recurrent hidden layers in such networks also enables the learning of higher level temporal features, for faster learning with sparser representations. In this paper, we use stacked LSTM net- works for anomaly/fault detection in time series. A network is trained on non-anomalous data and used as a predictor over a number of time steps. The resulting prediction errors are modeled as a multivariate Gaussian distribution, which is used to assess the likelihood of anomalous behav- ior. The efficacy of this approach is demonstrated on four datasets: ECG, space shuttle, power demand, and multi-sensor engine dataset.},
author = {Malhotra, Pankaj and Vig, Lovekesh and Shroff, Gautam and Agarwal, Puneet},
booktitle = {European Symposium on Artificial Neural Networks},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Long Short Term Memory Networks for Anomaly Detection in Time Series.pdf:pdf},
isbn = {9782875870148},
number = {April},
pages = {22--24},
title = {{Long Short Term Memory Networks for Anomaly Detection in Time Series}},
year = {2015}
}
@article{Chandola2009,
author = {Chandola, Varun and Banerjee, Arindam and Kumar, Vipin},
doi = {10.1145/1541880.1541882},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Anomaly Detection A Survey acm.pdf:pdf},
issn = {03600300},
journal = {ACM Computing Surveys},
keywords = {Anomaly detection,outlier detection},
month = {jul},
number = {3},
pages = {1--58},
title = {{Anomaly detection: A Survey}},
url = {http://portal.acm.org/citation.cfm?doid=1541880.1541882},
volume = {41},
year = {2009}
}
@article{Jones2014,
abstract = {We present a new algorithm for detecting anomalies in real- valued multidimensional time series. Our algorithm uses an exemplar-based model that is used to detect anomalies in single dimensions of the time series and a function that pre- dicts one dimension from a related one to detect anomalies in multiple dimensions. The algorithm is shown to work on a variety of different types of time series as well as to de- tect a variety of different types of anomalies. We compare our algorithm to other algorithms for both one-dimensional and multidimensional time series and demonstrate that it improves over the state-of-the-art.},
author = {Jones, Michael and Nikovski, Daniel and Imamura, Makoto and Hirata, Takahisa},
isbn = {9781625610003},
journal = {ASEBSC},
pages = {1--9},
title = {{Anomaly Detection in Real-Valued Multidimensional Time Series}},
url = {http://ase360.org/handle/123456789/56},
year = {2014}
}
@inproceedings{Bergstra2010,
abstract = {Theano is a compiler for mathematical expressions in Python that combines the convenience of NumPy's syntax with the speed of optimized native machine language. The user composes mathematical expressions in a high-level description that mimics NumPy's syntax and semantics, while being statically typed and functional (as opposed to imperative). These expressions allow Theano to provide symbolic differentiation. Before performing computation, Theano optimizes the choice of expressions, translates them into C++ (or CUDA for GPU), compiles them into dynamically loaded Python modules, all automatically. Common machine learning algorithms implemented with Theano are from 1:6 to 7:5 faster than competitive alternatives (including those implemented with C/C++, NumPy/SciPy and MATLAB) when compiled for the CPU and between 6:5 and 44 faster when compiled for the GPU. This paper illustrates how to use Theano, outlines the scope of the compiler, provides benchmarks on both CPU and GPU processors, and explains its overall design},
author = {Bergstra, James and Breuleux, Olivier and Bastien, Frederic and Lamblin, Pascal and Pascanu, Razvan and Desjardins, Guillaume and Turian, Joseph and Warde-Farley, David and Bengio, Yoshua},
booktitle = {9th Python in Science Conference},
file = {:C$\backslash$:/Users/Majid/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Bergstra et al. - 2010 - Theano a CPU and GPU math compiler in Python.pdf:pdf},
number = {Scipy},
pages = {1--7},
title = {{Theano: a CPU and GPU math compiler in Python}},
url = {http://www-etud.iro.umontreal.ca/{~}wardefar/publications/theano{\_}scipy2010.pdf},
year = {2010}
}
@article{Ng2006,
author = {Ng, B},
doi = {10.2172/900157},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Survey of Anomaly Detection Methods.pdf:pdf},
keywords = {and information science,computing,detection,distribution,general and miscellaneous//mathematics,lawrence livermore national laboratory},
title = {{Survey of Anomaly Detection Methods}},
url = {http://www.osti.gov/scitech/biblio/900157-VDshbd/},
year = {2006}
}
@article{Bengio1994,
abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captured increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.},
author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
doi = {10.1109/72.279181},
isbn = {1045-9227 VO  - 5},
issn = {10459227},
journal = {IEEE Transactions on Neural Networks},
number = {2},
pages = {157--166},
pmid = {18267787},
title = {{Learning long-term dependencies with gradient descent is difficult}},
volume = {5},
year = {1994}
}
@article{Szymanski2004,
abstract = { In this paper, a novel recursive data mining method based on the simple but powerful model of cognition called a conceptor is introduced and applied to computer security. The method recursively mines a string of symbols by finding frequent patterns, encoding them with unique symbols and rewriting the string using this new coding. We apply this technique to two related but important problems in computer security: (i) masquerade detection to prevent a security attack in which an intruder impersonates a legitimate user to gain access to the resources, and (ii) author identification, in which anonymous or disputed computer session needs to be attributed to one of a set of potential authors. Many methods based on automata theory, hidden Markov models, Bayesian models or even matching algorithms from bioinformatics have been proposed to solve the masquerading detection problem but less work has been done on the author identification. We used recursive data mining to characterize the structure and high-level symbols in user signatures and the monitored sessions. We used one-class SVM to measure the similarity of these two characterizations. We applied weighting prediction scheme to author identification. On the SEA dataset that we used in our experiments, the results were very promising.},
author = {Szymanski, B.K. and Zhang, Y.},
doi = {10.1109/IAW.2004.1437848},
isbn = {0-7803-8572-1},
journal = {Proceedings from the Fifth Annual IEEE SMC Information Assurance Workshop, 2004.},
keywords = {- masquerade detection,author identification,intrusion detection,one-class svm,recursive data mining},
number = {i},
pages = {424--431},
title = {{Recursive data mining for masquerade detection and author identification}},
year = {2004}
}
@inproceedings{Marchi2015,
author = {Marchi, Erik and Vesperini, Fabio and Eyben, Florian and Squartini, Stefano and Schuller, B},
booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
doi = {10.1109/ICASSP.2015.7178320},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/A NOVEL APPROACH FOR AUTOMATIC ACOUSTIC NOVELTY DETECTION USING A denoising autoencoder with bidirectional neural networks.pdf:pdf},
isbn = {9781467369978},
keywords = {Acoustic Novelty Detection,Bidirectional LSTM,Denoising Autoencorder,Feature extraction,Hidden Markov models,Noise reduction,Recurrent Neural Networks,Recurrent neural networks,Training,abnormal-novel acoustic signals,acoustic signal processing,auditory spectral features,automatic acoustic novelty detection,bidirectional LSTM neural networks,denoising autoencoder,long short term memory recurrent neural networks,novel unsupervised approach,recurrent neural nets,reference-normal data},
month = {apr},
number = {289021},
pages = {1996--2000},
title = {{A novel approach for automatic acoustic novelty detection using a denoising autoencoder with bidirectional LSTM neural networks}},
volume = {289021},
year = {2015}
}
@article{Sequeira2002,
abstract = {Security of computer systems is essential to their acceptance and utility. Computer security analysts use intrusion detection systems to assist them in maintaining computer system security. This paper deals with the problem of differentiating between masqueraders and the true user of a computer terminal. Prior efficient solutions are less suited to real time application, often requiring all training data to be labeled, and do not inherently provide an intuitive idea of what the data model means. Our system, called ADMIT, relaxes these constraints, by creating user profiles using semi-incremental techniques. It is a real-time intrusion detection system with host-based data collection and processing. Our method also suggests ideas for dealing with concept drift and affords a detection rate as high as 80.3{\%} and a false positive rate as low as 15.3{\%}.},
author = {Sequeira, Karlton and Zaki, Mohammed},
journal = {ACM SIGKDD international conference on Knowledge discovery and data mining},
pages = {386--395},
title = {{ADMIT: anomaly-based data mining for intrusions}},
url = {http://dl.acm.org/citation.cfm?id=775103},
year = {2002}
}
@article{Keogh2004a,
abstract = {Given the recent explosion of interest in streaming data and online algorithms, clustering of time seriessubsequences, extracted via a sliding window, has received much attention. In this work we make asurprising claim. Clustering of time series subsequences is meaningless. More concretely, clusters extractedfrom these time series are forced to obey a certain constraint that is pathologically unlikely to be satisfied byany dataset, and because of this, the clusters extracted by any clustering algorithm are essentially random.},
author = {Keogh, Eamonn and Keogh, Eamonn and Lin, Jessica and Lin, Jessica},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Clustering of Time Series Subsequences is Meaningless.pdf:pdf},
pmid = {670978},
title = {{Clustering of Time Series Subsequences is Meaningless:}},
url = {http://citeseer.ist.psu.edu/670978},
year = {2004}
}
@article{Ferdousi2006,
author = {Ferdousi, Z. and Maeda, a.},
doi = {10.1109/ICDEW.2006.157},
file = {:C$\backslash$:/Users/Majid/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Ferdousi, Maeda - 2006 - Unsupervised Outlier Detection in Time Series Data.pdf:pdf},
isbn = {0-7695-2571-7},
journal = {22nd International Conference on Data Engineering Workshops (ICDEW'06)},
keywords = {data mining,fraud detection,outlier detection,peer group analysis,series data,time},
pages = {x121--x121},
publisher = {Ieee},
title = {{Unsupervised Outlier Detection in Time Series Data}},
url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=1623916},
year = {2006}
}
@inproceedings{Zhen2006,
abstract = {Due to their growing complexity, it becomes extremely difficult to detect and isolate faults in complex systems. While large amount of monitoring data can be collected from such systems for fault analysis, one challenge is how to correlate the data effectively across distributed systems and observation time. Much of the internal monitoring data reacts to the volume of user requests accordingly when user requests flow through distributed systems. In this paper, we use Gaussian mixture models to characterize probabilistic correlation between flow-intensities measured at multiple points. A novel algorithm derived from expectation-maximization (EM) algorithm is proposed to learn the "likely" boundary of normal data relationship, which is further used as an oracle in anomaly detection. Our recursive algorithm can adaptively estimate the boundary of dynamic data relationship and detect faults in real time. Our approach is tested in a real system with injected faults and the results demonstrate its feasibility},
author = {Zhen, Guo and Jiang, Guofei and Chen, Haifeng and Yoshihira, Kenji},
booktitle = {International Conference on Dependable Systems and Networks},
doi = {10.1109/DSN.2006.70},
isbn = {0-7695-2607-1},
keywords = {Gaussian distribution,Gaussian mixture model,anomaly detection,complex system,distributed system,expectation-maximisation algorithm,expectation-maximization algorithm,fault analysis,fault detection,fault diagnosis,fault tolerant computing,probabilistic correlation tracking,recursive algorithm,system monitoring},
pages = {259--268},
publisher = {IEEE},
title = {{Tracking probabilistic correlation of monitoring data for fault detection in complex systems}},
year = {2006}
}
@article{Cho2014,
abstract = {In this paper, we propose a novel neural network model called RNN Encoder--Decoder that consists of two recurrent neural networks (RNN). One RNN encodes a sequence of symbols into a fixed-length vector representation, and the other decodes the representation into another sequence of symbols. The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence. The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder--Decoder as an additional feature in the existing linear model. Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases.},
archivePrefix = {arXiv},
arxivId = {1406.1078},
author = {Cho, Kyunghyun and van Merrienboer, Bart and Gulcehre, Caglar and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
doi = {10.3115/v1/D14-1179},
eprint = {1406.1078},
journal = {arXiv},
keywords = {decoder,for statistical machine translation,rning phrase representations using,rnn encoder},
pages = {1724--1734},
title = {{Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation}},
url = {http://arxiv.org/abs/1406.1078},
year = {2014}
}
@article{Thottan2010,
address = {London},
author = {Thottan, Marina and Liu, Guanglei and Ji, Chuanyi},
doi = {10.1007/978-1-84882-765-3},
editor = {Cormode, Graham and Thottan, Marina},
file = {:C$\backslash$:/Users/Majid/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Thottan, Liu, Ji - 2010 - Algorithms for Next Generation Networks(2).pdf:pdf},
isbn = {978-1-84882-764-6},
pages = {239--261},
publisher = {Springer London},
series = {Computer Communications and Networks},
title = {{Algorithms for Next Generation Networks}},
url = {http://link.springer.com/10.1007/978-1-84882-765-3},
year = {2010}
}
@inproceedings{knorr1997unified,
author = {Knorr, Edwin M and Ng, Raymond T},
booktitle = {Proceedings of the 1997 conference of the Centre for Advanced Studies on Collaborative research},
organization = {IBM Press},
pages = {11},
title = {{A unified approach for mining outliers}},
year = {1997}
}
@article{Mikolov2012,
abstract = {Recurrent neural network language models (RNNLMs) have recently demonstrated state-of-the-art performance across a variety of tasks. In this paper, we improve their performance by providing a contextual real-valued input vector in association with each word. This vector is used to convey contextual information about the sentence being modeled. By performing Latent Dirichlet Allocation using a block of preceding text, we achieve a topic-conditioned RNNLM. This approach has the key advantage of avoiding the data fragmentation associated with building multiple topic models on different data subsets. We report perplexity results on the Penn Treebank data, where we achieve a new state-of-the-art.We further apply the model to the Wall Street Journal speech recognition task, where we observe improvements in word-error-rate.},
author = {Mikolov, Tomas and Zweig, Geoffrey},
doi = {10.1109/SLT.2012.6424228},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/CONTEXT DEPENDENT RECURRENT NEURAL NETWORK LANGUAGE MODEL.pdf:pdf},
isbn = {978-1-4673-5126-3},
journal = {IEEE Workshop on Spoken Language Technology (SLT)},
keywords = {Latent Dirichlet Allocation,language modelling,recurrent neural networks,topic models},
pages = {234--239},
title = {{Context Dependent Recurrent Neural Network Language Model}},
year = {2012}
}
@thesis{Hochreiter,
author = {Hochreiter, S},
booktitle = {Master's thesis, Institut fur Informatik, Technische Universitat, Munchen},
title = {{Untersuchungen zu dynamischen neuronalen Netzen}},
url = {http://scholar.google.com/scholar?hl=en{\&}btnG=Search{\&}q=intitle:Untersuchungen+zu+dynamischen+neuronalen+Netzen{\#}0},
year = {1991}
}
@article{Keogh2001,
abstract = {The problem of similarity search in large time series databases has attracted much attention recently. It is a non-trivial problem because of the inherent high dimensionality of the data. The most promising solutions involve first performing dimensionality reduction on the data, and then indexing the reduced data with a spatial access method. Three major dimensionality reduction techniques have been proposed: Singular Value Decomposition (SVD), the Discrete Fourier transform (DFT), and more recently the Discrete Wavelet Transform (DWT). In this work we introduce a new dimensionality reduction technique which we call Piecewise Aggregate Approximation (PAA). We theoretically and empirically compare it to the other techniques and demonstrate its superiority. In addition to being competitive with or faster than the other methods, our approach has numerous other advantages. It is simple to understand and to implement, it allows more flexible distance measures, including weighted Euclidean queries, and the index can be built in linear time.},
author = {Keogh, Eamonn and Chakrabarti, Kaushik and Pazzani, Michael and Mehrotra, Sharad},
doi = {10.1007/PL00011669},
isbn = {1581133324},
issn = {0219-1377},
journal = {Knowledge and Information Systems},
keywords = {data mining,dimensionality reduction,indexing and retrieval,time series},
number = {3},
pages = {263--286},
pmid = {324778},
title = {{Dimensionality Reduction for Fast Similarity Search in Large Time Series Databases}},
url = {http://research.microsoft.com/pubs/79074/time{\_}series{\_}indexing.pdf},
volume = {3},
year = {2001}
}
@inproceedings{Gonzalez-Dominguez2014,
abstract = {This work explores the use of Long Short-Term Memory (LSTM) recurrent neural networks (RNNs) for automatic lan-guage identification (LID). The use of RNNs is motivated by their better ability in modeling sequences with respect to feed forward networks used in previous works. We show that LSTM RNNs can effectively exploit temporal dependencies in acoustic data, learning relevant features for language discrimination pur-poses. The proposed approach is compared to baseline i-vector and feed forward Deep Neural Network (DNN) systems in the NIST Language Recognition Evaluation 2009 dataset. We show LSTM RNNs achieve better performance than our best DNN system with an order of magnitude fewer parameters. Further, the combination of the different systems leads to significant per-formance improvements (up to 28{\%}).},
author = {Gonzalez-Dominguez, Javier and Lopez-Moreno, Ignacio and Sak, Hasim and Gonzalez-Rodriguez, Joaquin and Moreno, Pedro J.},
booktitle = {Interspeech-2014},
issn = {2308457X},
pages = {2155--2159},
title = {{Automatic Language Identification using Long Short-Term Memory Recurrent Neural Networks}},
year = {2014}
}
@article{Lane1999,
abstract = {The anomal-detection problem can be forrmulated as one of learning to characterize the behaviors of an individual, system, or network in terms of temporal sequences of discrete data. We present an approach on the basis of instrance-based learning (IBL) techniques. To cast the anomaly-detection task in an IBL framework, we employ an approach that transforms temporal sequences of discrete, unordered observations into a metric space via a similarity measure that encodes intra-attribue dependencies. Classification boundaries are selected from an a posteriori characterization of valid user bhaviours, coupled with a domain heurisitc. An empercial evaluation of the approach on user command data demonstrates that we can accurately differentiate the profiled user from alternative users when the avaiable features encode sufficient information. Furthermore, we demonstrate that the system detects anomalous conditions quickly - an important quality for reducing data storage requirements of the user profile, invluding instance-selection methods and clustering. An empirical evaluation shows that a new greedy clustering algorithm reduces the size of the user model by 70{\%}, with only a small loss in accuracy.},
author = {Lane, Terran and Brodley, Carla E.},
doi = {10.1145/322510.322526},
isbn = {1581130074},
issn = {10949224},
journal = {ACM Transactions on Information and System Security},
number = {3},
pages = {295--331},
title = {{Temporal sequence learning and data reduction for anomaly detection}},
volume = {2},
year = {1999}
}
@article{Cheboli2010a,
author = {Cheboli, Deepthi},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Anomaly Detection of Time Series.pdf:pdf},
title = {{Anomaly detection of time series}},
url = {http://udc.umn.edu/handle/11299/92985},
year = {2010}
}
@article{Lasaponara2006,
abstract = {In this work, we discuss the use of principal component analysis (PCA) for evaluating the vegetation interannual anomalies. The analysis was preformed on a temporal series (1999-2002) of the yearly Maximum Value Composit of SPOT/VEGETATION NDVI acquired for Sicily Island. The PCA was used as a data transform to enhance regions of localized change in multi-temporal data sets. This is a direct result of the high correlation that exists among images for regions that do not change significantly and the relatively low correlation associated with regions that change substantially. Both naturally vegetated areas (forest, shrub-land, herbaceous cover) and agricultural lands have been investigated in order to extract the most prominent natural and/or man-induced alterations affecting vegetation behavior. Our findings suggest that PCA can provide valuable information for environmental management policies involving biodiversity preservation and rational exploitation of natural and agricultural resources. ?? 2005 Elsevier B.V. All rights reserved.},
author = {Lasaponara, R.},
doi = {10.1016/j.ecolmodel.2005.10.035},
isbn = {0304-3800},
issn = {03043800},
journal = {Ecological Modelling},
keywords = {Change detection,Desertification,PCA,Satellite temporal series},
number = {4},
pages = {429--434},
pmid = {17952086},
title = {{On the use of principal component analysis (PCA) for evaluating interannual vegetation anomalies from Spot/Vegetation NDVI temporal series}},
volume = {194},
year = {2006}
}
@article{Zolhavarieh2014a,
abstract = {Clustering of subsequence time series remains an open issue in time series clustering. Subsequence time series clustering is used in different fields, such as e-commerce, outlier detection, speech recognition, biological systems, DNA recognition, and text mining. One of the useful fields in the domain of subsequence time series clustering is pattern recognition. To improve this field, a sequence of time series data is used. This paper reviews some definitions and backgrounds related to subsequence time series clustering. The categorization of the literature reviews is divided into three groups: preproof, interproof, and postproof period. Moreover, various state-of-the-art approaches in performing subsequence time series clustering are discussed under each of the following categories. The strengths and weaknesses of the employed methods are evaluated as potential issues for future studies.},
author = {Zolhavarieh, Seyedjamal and Aghabozorgi, Saeed and Teh, Ying Wah},
doi = {10.1155/2014/312521},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/A Review of Subsequence Time Series Clustering.pdf:pdf},
isbn = {1537-744x},
issn = {1537-744X},
journal = {TheScientificWorldJournal},
pages = {312521},
pmid = {25140332},
title = {{A Review of Subsequence Time Series Clustering.}},
url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=4130317{\&}tool=pmcentrez{\&}rendertype=abstract},
volume = {2014},
year = {2014}
}
@article{Borne,
author = {Borne, Kirk D},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/kborne-ML.pdf:pdf},
keywords = {data mining,data streams,outlier detection,space science,unsupervised learning},
pages = {1--26},
title = {{Effective Outlier Detection using K-Nearest Neighbor Data Distributions : Unsupervised Exploratory Mining of Non-Stationarity in Data Streams}}
}
@article{Graves2009,
abstract = {Recognizing lines of unconstrained handwritten text is a challenging task. The difficulty of segmenting cursive or overlapping characters, combined with the need to exploit surrounding context, has led to low recognition rates for even the best current recognizers. Most recent progress in the field has been made either through improved preprocessing or through advances in language modeling. Relatively little work has been done on the basic recognition algorithms. Indeed, most systems rely on the same hidden Markov models that have been used for decades in speech and handwriting recognition, despite their well-known shortcomings. This paper proposes an alternative approach based on a novel type of recurrent neural network, specifically designed for sequence labeling tasks where the data is hard to segment and contains long-range bidirectional interdependencies. In experiments on two large unconstrained handwriting databases, our approach achieves word recognition accuracies of 79.7 percent on online data and 74.1 percent on offline data, significantly outperforming a state-of-the-art HMM-based system. In addition, we demonstrate the network's robustness to lexicon size, measure the individual influence of its hidden layers, and analyze its use of context. Last, we provide an in-depth discussion of the differences between the network and HMMs, suggesting reasons for the network's superior performance.},
author = {Graves, Alex and Liwicki, Marcus and Fern{\'{a}}ndez, Santiago and Bertolami, Roman and Bunke, Horst and Schmidhuber, J{\"{u}}rgen},
doi = {10.1109/TPAMI.2008.137},
isbn = {0162-8828 (Print)$\backslash$r0098-5589 (Linking)},
issn = {01628828},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {Bidirectional long short-term memory,Connectionist temporal classification,Handwriting recognition,Hidden Markov model,Offline handwriting,Online handwriting,Recurrent neural networks},
number = {5},
pages = {855--868},
pmid = {19299860},
title = {{A novel connectionist system for unconstrained handwriting recognition}},
volume = {31},
year = {2009}
}
@inproceedings{Bluche,
author = {Bluche, T and Louradour, J and Knibbe, M and Moysset, B and Benzeghiba, M F and Kermorvant, C},
booktitle = {Document Analysis Systems (DAS), 2014 11th IAPR International Workshop on},
doi = {10.1109/DAS.2014.40},
keywords = {A2iA Arabic handwritten text recognition system,Accuracy,Arabic handwriting recognition systems,Handwriting recognition,Hidden Markov models,LSTM recurrent neural networks,Large vocabulary Handwriting Recognition,OpenHaRT,OpenHaRT2013 evaluation,ROVER,ROVER combination algorithm,Recurrent Neural Networks,Recurrent neural networks,Text recognition,Training,Vocabulary,full paragraph recognition,handwriting recognition,long short-term memory,n-gram language modeling,natural language processing,recurrent neural nets,text detection,vocabulary selection techniques},
month = {apr},
pages = {161--165},
title = {{The A2iA Arabic Handwritten Text Recognition System at the Open HaRT2013 Evaluation}},
year = {2014}
}
@article{Snoek2012a,
abstract = {Machine learning algorithms frequently require careful tuning of model hyperparameters, regularization terms, and optimization parameters. Unfortunately, this tuning is often a "black art" that requires expert experience, unwritten rules of thumb, or sometimes brute-force search. Much more appealing is the idea of developing automatic approaches which can optimize the performance of a given learning algorithm to the task at hand. In this work, we consider the automatic tuning problem within the framework of Bayesian optimization, in which a learning algorithm's generalization performance is modeled as a sample from a Gaussian process (GP). The tractable posterior distribution induced by the GP leads to efficient use of the information gathered by previous experiments, enabling optimal choices about what parameters to try next. Here we show how the effects of the Gaussian process prior and the associated inference procedure can have a large impact on the success or failure of Bayesian optimization. We show that thoughtful choices can lead to results that exceed expert-level performance in tuning machine learning algorithms. We also describe new algorithms that take into account the variable cost (duration) of learning experiments and that can leverage the presence of multiple cores for parallel experimentation. We show that these proposed algorithms improve on previous automatic procedures and can reach or surpass human expert-level optimization on a diverse set of contemporary algorithms including latent Dirichlet allocation, structured SVMs and convolutional neural networks.},
archivePrefix = {arXiv},
arxivId = {1206.2944},
author = {Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P.},
eprint = {1206.2944},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Practical Bayesian Optimization of Machine Learning Algorithms.pdf:pdf},
pages = {1--12},
title = {{Practical Bayesian Optimization of Machine Learning Algorithms}},
url = {http://arxiv.org/abs/1206.2944},
year = {2012}
}
@misc{Tieleman2012,
author = {Tieleman, T. and Hinton, Geoffrey},
booktitle = {COURSERA},
title = {{Neural Networks for Machine Learning (lecture 6.5)}},
year = {2012}
}
@inproceedings{kitaguchi2004extracting,
author = {Kitaguchi, S},
booktitle = {Proceedings of 18th Annual Conference of the Japanese Society for Artificial Intelligence (JSAI'04)},
title = {{Extracting feature based on motif from a chronic hepatitis dataset}},
year = {2004}
}
@article{Kriegel2009,
author = {Kriegel, Hans-Peter and Kr{\"{o}}ger, Peer and Zimek, Arthur},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/kdd10-outlier-tutorial.pdf:pdf},
journal = {Tutorial at the 13th Pacific-Asia Conference on Knowledge Discovery and Data Mining},
title = {{Outlier detection techniques}},
year = {2009}
}
@article{Keogh2005,
abstract = { In this work, we introduce the new problem of finding time series discords. Time series discords are subsequences of a longer time series that are maximally different to all the rest of the time series subsequences. They thus capture the sense of the most unusual subsequence within a time series. Time series discords have many uses for data mining, including improving the quality of clustering, data cleaning, summarization, and anomaly detection. Discords are particularly attractive as anomaly detectors because they only require one intuitive parameter (the length of the subsequence) unlike most anomaly detection algorithms that typically require many parameters. We evaluate our work with a comprehensive set of experiments. In particular, we demonstrate the utility of discords with objective experiments on domains as diverse as Space Shuttle telemetry monitoring, medicine, surveillance, and industry, and we demonstrate the effectiveness of our discord discovery algorithm with more than one million experiments, on 82 different datasets from diverse domains.},
author = {Keogh, Eamonn and Lin, Jessica and Fu, Ada},
doi = {10.1109/ICDM.2005.79},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/HOT SAX Efficiently Finding the Most Unusual Time Series Subsequence.pdf:pdf},
isbn = {0769522785},
issn = {15504786},
journal = {Proceedings - IEEE International Conference on Data Mining, ICDM},
keywords = {Anomaly detection,Clustering,Time series data mining},
pages = {226--233},
title = {{HOT SAX: Efficiently finding the most unusual time series subsequence}},
year = {2005}
}
@article{Kandhari2009,
author = {Kandhari, Rupali},
doi = {10.1145/1541880.1541882},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Anomaly Detection A Survey acm.pdf:pdf},
isbn = {0818663359},
issn = {03600300},
keywords = {Anomaly detection, outlier detection},
number = {3},
pages = {1--6},
pmid = {21834704},
title = {{Anomaly detection}},
volume = {41},
year = {2009}
}
@article{Dasgupta1996,
abstract = {Detecting anomalies in time series data is a problem of great practical interest in many manufacturing and signal processing applications. This paper presents a novelty detection algorithm inspired by the negative-selection mechanism of the immune system, which discriminates between self and other. Here self is defined to be normal data patterns and non-self is any deviation exceeding an allowable variation. An example application, simulated cutting dynamics in a milling operation, is presented, and the performance of the algorithm in detecting the tool breakage is reported.},
author = {Dasgupta, D. and Forrest, Stephanie},
journal = {Proceedings of the International Conference on Intelligent Systems},
pages = {82--87},
title = {{Novelty detection in time series data using ideas from immunology}},
url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.57.3894{\&}rep=rep1{\&}type=pdf},
year = {1996}
}
@book{Seeger2004,
abstract = {Gaussian processes (GPs) are natural generalisations of multivariate Gaussian random variables to infinite (countably or continuous) index sets. GPs have been applied in a large number of fields to a diverse range of ends, and very many deep theoretical analyses of various properties are available. This paper gives an introduction to Gaussian processes on a fairly elementary level with special emphasis on characteristics relevant in machine learning. It draws explicit connections to branches such as spline smoothing models and support vector machines in which similar ideas have been investigated. Gaussian process models are routinely used to solve hard machine learning problems. They are attractive because of their flexible non-parametric nature and computational simplicity. Treated within a Bayesian framework, very powerful statistical methods can be implemented which offer valid estimates of uncertainties in our predictions and generic model selection procedures cast as nonlinear optimization problems. Their main drawback of heavy computational scaling has recently been alleviated by the introduction of generic sparse approximations.13,78,31 The mathematical literature on GPs is large and often uses deep concepts which are not required to fully understand most machine learning applications. In this tutorial paper, we aim to present characteristics of GPs relevant to machine learning and to show up precise connections to other "kernel machines" popular in the community. Our focus is on a simple presentation, but references to more detailed sources are provided.},
archivePrefix = {arXiv},
arxivId = {026218253X},
author = {Seeger, Matthias},
booktitle = {International journal of neural systems},
doi = {10.1142/S0129065704001899},
eprint = {026218253X},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Gaussian Processes for Machine Learning.pdf:pdf},
isbn = {026218253X},
issn = {0129-0657},
number = {2},
pages = {69--106},
pmid = {15112367},
title = {{Gaussian processes for machine learning.}},
volume = {14},
year = {2004}
}
@book{Minsky1967,
address = {Upper Saddle River, NJ, USA},
author = {Minsky, Marvin L},
isbn = {0-13-165563-9},
publisher = {Prentice-Hall, Inc.},
title = {{Computation: Finite and Infinite Machines}},
year = {1967}
}
@inproceedings{Munz2007,
abstract = {—Data mining techniques make it possible to search large amounts of data for characteristic rules and patterns. If applied to network monitoring data recorded on a host or in a network, they can be used to detect intrusions, attacks and/or anomalies. This paper gives an introduction to Network Data Mining, i.e. the application of data mining methods to packet and flow data captured in a network, including a comparative overview of existing approaches. Furthermore, we present a novel flow-based anomaly detection scheme based on the K-mean clus-tering algorithm. Training data containing unlabeled flow records are separated into clusters of normal and anomalous traffic. The corresponding cluster centroids are used as patterns for computationally efficient distance-based detection of anomalies in new monitoring data. We provide a detailed description of the data mining and the anomaly detection processes, and present first experimental results.},
author = {M{\"{u}}nz, Gerhard and Li, Sa and Carle, Georg},
booktitle = {GI/ITG Workshop MMBnet},
title = {{Traffic anomaly detection using k-means clustering}},
url = {http://www.decom.ufop.br/menotti/rp122/sem/sem3-luciano-art.pdf},
year = {2007}
}
@article{He2003,
abstract = {In this paper, we present a new definition for outlier: Cluster-based local outlier , which is meaningful and provides importance to the local data behavior. A measure for identifying the physical significance of an outlier is designed, which is called cluster-based local outlier factor (CBLOF). We also propose the Find CBLOF algorithm for discovering outliers. The experimental results show that our approach outperformed the existing methods on identifying meaningful and interesting outliers. ?? 2003 Elsevier Science B.V. All rights reserved.},
author = {He, Zengyou and Xu, Xiaofei and Deng, Shengchun},
doi = {10.1016/S0167-8655(03)00003-5},
isbn = {0167-8655},
issn = {01678655},
journal = {Pattern Recognition Letters},
keywords = {Clustering,Data mining,Outlier detection},
number = {9-10},
pages = {1641--1650},
title = {{Discovering cluster-based local outliers}},
volume = {24},
year = {2003}
}
@article{Doya1992,
abstract = {Gradient descent algorithms in recurrent neural networks can have$\backslash$nproblems when the network dynamics experience bifurcations in the course$\backslash$nof learning. The possible hazards caused by the bifurcations of the$\backslash$nnetwork dynamics and the learning equations are investigated. The roles$\backslash$nof teacher forcing, preprogramming of network structures, and the$\backslash$napproximate learning algorithms are discussed},
author = {Doya, K.},
doi = {10.1109/ISCAS.1992.230622},
isbn = {0-7803-0593-0},
issn = {0-7803-0593-0},
journal = {[Proceedings] 1992 IEEE International Symposium on Circuits and Systems},
pages = {1--4},
title = {{Bifurcations in the learning of recurrent neural networks}},
url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.40.5278{\&}rep=rep1{\&}type=pdf},
volume = {6},
year = {1992}
}
@inproceedings{Angiulli2007,
abstract = {In this work a method for detecting distance-based outliers in data streams is presented. We deal with the sliding win- dow model, where outlier queries are performed in order to detect anomalies in the current window. Two algorithms are presented. The first one exactly answers outlier queries, but has larger space requirements. The second algorithm is directly derived from the exact one, has limited memory requirements and returns an approximate answer based on accurate estimations with a statistical guarantee. Several experiments have been accomplished, confirming the effec- tiveness of the proposed approach and the high quality of approximate solutions.},
author = {Angiulli, Fabrizio and Fassetti, Fabio},
booktitle = {Proceedings of the ACM Sixteenth Conference on Information and Knowledge Management},
doi = {10.1145/1321440.1321552},
isbn = {9781595938039},
pages = {811--820},
title = {{Detecting Distance-based Outliers in Streams of Data}},
year = {2007}
}
@article{Buckheit1995,
abstract = {Wavelab is a library of wavelet-packet analysis, cosine-packet analysis and matching pursuit. The library is available free of charge over the Internet. Versions are provided for Macintosh, UNIX and Windows machines. Wavelab makes available, in one package, all the code to reproduce all the figures in our published wavelet articles. The interested reader can inspect the source code to see exactly what algorithms were used, how parameters were set in producing our figures, and can then modify the source to produce variations on our results. WAVELAB has been developed, in part, because of exhortations by Jon Claerbout of Stanford that computational scientists should engage in “really reproducible” research.},
author = {Buckheit, Jb and Donoho, Dl},
doi = {10.1007/978-1-4612-2544-7},
isbn = {978-0-387-94564-4},
journal = {Wavelets and Statistics},
pages = {55--81},
title = {{WaveLab and Reproducible Research}},
url = {http://link.springer.com/chapter/10.1007/978-1-4612-2544-7{\_}5$\backslash$nhttp://link.springer.com/10.1007/978-1-4612-2544-7},
volume = {103},
year = {1995}
}
@article{Bu2007,
abstract = {Finding discords in time series database is an important problem in a great variety of applications, such as space shuttle telemetry, mechanical industry, biomedicine, and financial data analysis. However, most previous methods for this problem suffer from too many parameter settings which are difficult for users. The best known approach to our knowledge that has comparatively fewer parameters still requires users to choose a word size for the compression of subsequences. In this paper, we propose a Haar wavelet and augmented trie based algorithm to mine the top-K discords from a time series database, which can dynamically determine the word size for compression. Due to the characteristics of Haar wavelet transform, our algorithm has greater pruning power than previous approaches. Through experiments with some annotated datasets, the effectiveness and efficiency of our algorithm are both attested.},
author = {Bu, Yingyi and Leung, Tw and Fu, Awc and Keogh, Eamonn and Pei, Jian and Meshkin, Sam},
isbn = {9780898716306 (ISBN)},
journal = {In Proceedings of the 2007 SIAM International Conference on Data Mining (SDM'07)},
pages = {26--28},
title = {{Wat: Finding top-k discords in time series database}},
url = {http://www.cse.cuhk.edu.hk/{~}adafu/Pub/sdm07.pdf},
year = {2007}
}
@article{Romeu2013,
abstract = {Artificial neural networks have proved to be good at time-series forecasting problems, being widely studied at literature. Traditionally, shallow architectures were used due to convergence problems when dealing with deep models. Recent research findings enable deep architectures training, opening a new interesting research area called deep learning. This paper presents a study of deep learning techniques applied to time-series forecasting in a real indoor temperature forecasting task, studying performance due to different hyper-parameter configurations. When using deep models, better generalization performance at test set and an over-fitting reduction has been observed.},
author = {Romeu, Pablo},
doi = {10.1007/978-3-642-40728-4_57},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Time-Series Forecasting of Indoor Temperature Using Pre-trained Deep Neural Networks.pdf:pdf},
journal = {International Conference on Artificial Neural Networks},
keywords = {artificial neural networks,auto-,deep learning,encoders,energy efficiency,temperature forecasting,time series},
pages = {451--458},
title = {{Time-Series Forecasting of Indoor Temperature Using Pre-trained Deep Neural Networks}},
url = {http://link.springer.com/chapter/10.1007/978-3-642-40728-4{\_}57},
volume = {8131},
year = {2013}
}
@article{Martens2011,
abstract = {In this work we resolve the long-outstanding problem of how to effectively train recurrent neural networks (RNNs) on complex and difficult sequence modeling problems which may contain long-term data dependencies. Utilizing recent advances in the Hessian-free optimization approach (Martens, 2010), together with a novel damping scheme, we successfully train RNNs on two sets of challenging problems. First, a collection of pathological synthetic datasets which are known to be impossible for standard optimization approaches (due to their extremely long-term dependencies), and second, on three natural and highly complex real-world sequence datasets where we find that our method significantly outperforms the previous state-of-the-art method for training neural sequence models: the Long Short-term Memory approach of Hochreiter and Schmidhuber (1997). Additionally, we offer a new interpretation of the generalized Gauss-Newton matrix of sch (2002) which is used within the HF approach of Martens. Copyright 2011 by the author(s)/owner(s).},
author = {Martens, James and Sutskever, Ilya},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Learning Recurrent Neural Networks with Hessian-Free Optimization.pdf:pdf},
isbn = {978-1-4503-0619-5},
journal = {Proceedings of the 28th International Conference on Machine Learning, ICML 2011},
keywords = {Learning systems,Optimization},
pages = {1033--1040},
title = {{Learning recurrent neural networks with Hessian-free optimization}},
year = {2011}
}
@inproceedings{Lewandowski2010,
abstract = {A novel non-linear dimensionality reduction method, called Temporal Laplacian Eigenmaps, is introduced to process efficiently time series data. In this embedded-based approach, temporal information is intrinsic to the objective function, which produces description of low dimensional spaces with time coherence between data points. Since the proposed scheme also includes bidirectional mapping between data and embedded spaces and automatic tuning of key parameters, it offers the same benefits as mapping-based approaches. Experiments on a couple of computer vision applications demonstrate the superiority of the new approach to other dimensionality reduction method in term of accuracy. Moreover, its lower computational cost and generalisation abilities suggest it is scalable to larger datasets.},
address = {Istanbul, Turkey},
author = {Lewandowski, M. and Martinez-del-Rincon, J. and Makris, D. and Nebel, J. C.},
booktitle = {Proceedings - International Conference on Pattern Recognition},
doi = {10.1109/ICPR.2010.48},
isbn = {9780769541099},
issn = {10514651},
keywords = {Dimensionality reduction,Human motion,Manifold learning,Temporal Laplacian Eigenmap,Time-series},
pages = {161--164},
pmid = {5597623},
publisher = {IEEE – Institute of Electrical and Electronics Engineers},
title = {{Temporal extension of Laplacian Eigenmaps for unsupervised dimensionality reduction of time series}},
year = {2010}
}
@article{Basu2007,
abstract = {In this article we consider the problem of detecting unusual values or outliers from time series data where the process by which the data are created is difficult to model. The main consideration is the fact that data closer in time are more correlated to each other than those farther apart.We propose two varia- tions of a method that uses the median from a neighborhood of a data point and a threshold value to compare the difference between the median and the observed data value. Both variations of themethod are fast and can be used for data streams that occur in quick succession such as sensor data on an airplane.},
author = {Basu, Sabyasachi and Meckesheimer, Martin},
doi = {10.1007/s10115-006-0026-6},
isbn = {0219-1377},
issn = {02191377},
journal = {Knowledge and Information Systems},
keywords = {Jaccard coefficient,Outliers,Sensor data,Simulation,Time series},
number = {2},
pages = {137--154},
shorttitle = {Automatic outlier detection for time series},
title = {{Automatic outlier detection for time series: An application to sensor data}},
url = {http://link.springer.com/article/10.1007/s10115-006-0026-6},
volume = {11},
year = {2007}
}
@article{Yankov2008,
abstract = {The problem of finding unusual time series has recently attracted much attention, and several promising methods are now in the literature. However, virtually all proposed methods assume that the data reside in main memory. For many real-world problems this is not be the case. For example, in astronomy, multi-terabyte time series datasets are the norm. Most current algorithms faced with data which cannot fit in main memory resort to multiple scans of the disk/tape and are thus intractable. In this work we show how one particular definition of unusual time series, the time series discord, can be discovered with a disk aware algorithm. The proposed algorithm is exact and requires only two linear scans of the disk with a tiny buffer of main memory. Furthermore, it is very simple to implement. We use the algorithm to provide further evidence of the effectiveness of the discord definition in areas as diverse as astronomy, Web query mining, video surveillance, etc., and show the efficiency of our method on datasets which are many orders of magnitude larger than anything else attempted in the literature.},
author = {Yankov, Dragomir and Keogh, Eamonn and Rebbapragada, Umaa},
doi = {10.1007/s10115-008-0131-9},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Disk aware discord discovery finding unusual time series in terabyte sized datasets.pdf:pdf},
isbn = {0769530184},
issn = {02191377},
journal = {Knowledge and Information Systems},
keywords = {Discords,Disk aware algorithms,Distance outliers,Time series},
number = {2},
pages = {241--262},
title = {{Disk aware discord discovery: Finding unusual time series in terabyte sized datasets}},
volume = {17},
year = {2008}
}
@article{Breunig1999,
abstract = {For many KDD applications finding the outliers, i.e. the rare events,$\backslash$nis$\backslash$n$\backslash$nmore interesting and useful than finding the common cases, e.g. detecting$\backslash$ncriminal$\backslash$n$\backslash$nactivities in E-commerce. Being an outlier, however, is not just a$\backslash$nbinary property.$\backslash$n$\backslash$nInstead, it is a property that applies to a certain degree to each$\backslash$nobject in a data set,$\backslash$n$\backslash$ndepending on how ‘isolated' this object is, with respect to the surrounding$\backslash$nclus-$\backslash$n$\backslash$ntering structure. In this paper, we formally introduce a new notion$\backslash$nof outliers$\backslash$n$\backslash$nwhich bases outlier detection on the same theoretical foundation as$\backslash$ndensity-based$\backslash$n$\backslash$ncluster analysis. Our notion of an outlier is ‘local' in the sense$\backslash$nthat the outlier-de-$\backslash$n$\backslash$ngree of an object is determined by taking into account the clustering$\backslash$nstructure in$\backslash$n$\backslash$na bounded neighborhood of the object. We demonstrate that this notion$\backslash$nof an out-$\backslash$n$\backslash$nlier is more appropriate for detecting different types of outliers$\backslash$nthan previous ap-$\backslash$n$\backslash$nproaches, and we also present an algorithm for finding them. Furthermore,$\backslash$nwe$\backslash$n$\backslash$nshow that by combining the outlier detection with a density-based$\backslash$nmethod to an-$\backslash$n$\backslash$nalyze the clustering structure, we can get the outliers almost for$\backslash$nfree if we already$\backslash$n$\backslash$nwant to perform a cluster analysis on a data set.},
author = {Breunig, MM and Kriegel, HP and Ng, RT and Sander, J{\"{o}}rg},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/OPTICS-OF Identifying Local Outliers.pdf:pdf},
isbn = {3-540-66490-4},
journal = {Principles of Data Mining and {\ldots}},
pages = {262--270},
title = {{Optics-of: Identifying local outliers}},
url = {http://link.springer.com/chapter/10.1007/978-3-540-48247-5{\_}28},
year = {1999}
}
@article{Graves2005,
abstract = {In this paper, we present bidirectional Long Short Term Memory (LSTM) networks, and a modified, full gradient version of the LSTM learning algorithm. We evaluate Bidirectional LSTM (BLSTM) and several other network architectures on the benchmark task of framewise phoneme classification, using the TIMIT database. Our main findings are that bidirectional networks outperform unidirectional ones, and Long Short Term Memory (LSTM) is much faster and also more accurate than both standard Recurrent Neural Nets (RNNs) and time-windowed Multilayer Perceptrons (MLPs). Our results support the view that contextual information is crucial to speech processing, and suggest that BLSTM is an effective architecture with which to exploit it. {\textcopyright} 2005 Elsevier Ltd. All rights reserved.},
author = {Graves, Alex and Schmidhuber, J{\"{u}}rgen},
doi = {10.1109/IJCNN.2005.1556215},
isbn = {0780390482},
issn = {08936080},
journal = {Proceedings of the International Joint Conference on Neural Networks},
pages = {2047--2052},
pmid = {16112549},
title = {{Framewise phoneme classification with bidirectional LSTM networks}},
volume = {4},
year = {2005}
}
@phdthesis{IlyaSutskever2013,
author = {{Ilya Sutskever}},
booktitle = {Ph.D thesis},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/TRAINING RECURRENT NEURAL NETWORKS.pdf:pdf},
title = {{Training Recurrent Neural Networks}},
year = {2013}
}
@article{Beyer1999,
abstract = {We explore the effect of dimensionality on the “nearest neighbor” problem. We show that under a broad set of conditions (much broader than independent and identically distributed dimensions), as dimensionality increases, the distance to the nearest data point approaches the distance to the farthest data point. To provide a practical perspective, we present empirical results on both real and synthetic data sets that demonstrate that this effect can occur for as few as 10–15 dimensions. These results should not be interpreted to mean that high-dimensional indexing is never meaningful; we illustrate this point by identifying some high-dimensional workloads for which this effect does not occur. However, our results do emphasize that the methodology used almost universally in the database literature to evaluate high-dimensional indexing techniques is flawed, and should be modified. In particular, most such techniques proposed in the literature are not evaluated versus simple linear scan, and are evaluated over workloads for which nearest neighbor is not meaningful. Often, even the reported experiments, when analyzed carefully, show that linear scan would outperform the techniques being proposed on the workloads studied in high (10–15) dimensionality!},
author = {Beyer, Kevin and Goldstein, Jonathan and Ramakrishnan, Raghu and Shaft, Uri},
doi = {10.1007/3-540-49257-7_15},
isbn = {978-3-540-65452-0},
issn = {3540654526},
journal = {Database Theory—ICDT'99},
pages = {217--235},
title = {{When is “nearest neighbor” meaningful?}},
url = {http://link.springer.com/chapter/10.1007/3-540-49257-7{\_}15},
year = {1999}
}
@article{Wang2013,
abstract = {The previous decade has brought a remarkable increase of the interest in applications that deal with querying and mining of time series data. Many of the research efforts in this context have focused on introducing new representation methods for dimensionality reduction or novel similarity measures for the underlying data. In the vast majority of cases, each individual work introducing a particular method has made specific claims and, aside from the occasional theoretical justifications, provided quantitative experimental observations. However, for the most part, the comparative aspects of these experiments were too narrowly focused on demonstrating the benefits of the proposed methods over some of the previously introduced ones. In order to provide a comprehensive validation, we conducted an extensive experimental study re-implementing eight different time series representations and nine similarity measures and their variants, and testing their effectiveness on thirty-eight time series data sets from a wide variety of application domains. In this paper, we give an overview of these different techniques and present our comparative experimental findings regarding their effectiveness. In addition to providing a unified validation of some of the existing achievements, our experiments also indicate that, in some cases, certain claims in the literature may be unduly optimistic.},
archivePrefix = {arXiv},
arxivId = {1012.2789},
author = {Wang, Xiaoyue and Mueen, Abdullah and Ding, Hui and Trajcevski, Goce and Scheuermann, Peter and Keogh, Eamonn},
doi = {10.1007/s10618-012-0250-5},
eprint = {1012.2789},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Experimental comparison of representation methods and distance measures for time series data.pdf:pdf},
issn = {13845810},
journal = {Data Mining and Knowledge Discovery},
keywords = {Distance measure,Experimental comparison,Representation,Time series},
number = {2},
pages = {275--309},
title = {{Experimental comparison of representation methods and distance measures for time series data}},
volume = {26},
year = {2013}
}
@article{Li2007,
abstract = {Market analysis is a representative data analysis process with many$\backslash$napplications. In such an analysis, critical numerical measures, such$\backslash$nas profit and sales, fluctuate over time and form time-series data.$\backslash$nMoreover, the time series data correspond to market segments, which$\backslash$nare described by a set of attributes, such as age, gender, education,$\backslash$nincome level, and product-category, that form a multi-dimensional$\backslash$nstructure. To better understand market dynamics and predict future$\backslash$ntrends, it is crucial to study the dynamics of time-series in multi-dimensional$\backslash$nmarket segments. This is a topic that has been largely ignored in$\backslash$ntime series and data cube research. In this study, we examine the$\backslash$nissues of anomaly detection in multi-dimensional time-series data.$\backslash$nWe propose time-series data cube to capture the multi-dimensional$\backslash$nspace formed by the attribute structure. This facilitates the detection$\backslash$nof anomalies based on expected values derived from higher level,$\backslash$n"more general" time-series. Anomaly detection in a time-series data$\backslash$ncube poses computational challenges, especially for high-dimensional,$\backslash$nlarge data sets. To this end, we also propose an efficient search$\backslash$nalgorithm to iteratively select subspaces in the original high-dimensional$\backslash$nspace and detect anomalies within each one. Our experiments with$\backslash$nboth synthetic and real-world data demonstrate the effectiveness$\backslash$nand efficiency of the proposed solution.},
author = {Li, Xiaolei and Han, Jiawei},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Mining Approximate Top-K Subspace Anomalies in multidimensional time-series data.pdf:pdf},
isbn = {9781595936493},
issn = {{\textless}null{\textgreater}},
journal = {International conference on Very large data bases},
keywords = {anomalies in,ing approximate top-k subspace,multi-dimensional time-series data},
pages = {447--458},
title = {{Mining approximate top-k subspace anomalies in multi-dimensional time-series data}},
url = {http://dl.acm.org/citation.cfm?id=1325904},
year = {2007}
}
@article{Hofmeyr1998,
abstract = {A method is introducted for detecting intrusions at the level of privileged processes. Evidence is given that short sequences of system calls executed by running processes are a good discriminator between normal and abnormal operating characteristics of several common UNIX programs. Normal behavior is collected in two ways: Synthetically, by exercising as many normal modes of usage of a program as possible, and in a live user environment by tracing the actual execution of the program. In the former case several types of intrusive behavior were studied; in the latter case, results were analyzed for false positives. 1 Introduction Modern computer systems are plagued by security vulnerabilities. Whether it is the latest UNIX buffer overflow or bug in Microsoft Internet Explorer, our applications and operating systems are full of security flaws on many levels. From the viewpoint of the traditional security paradigm, it should be possible to eliminate such problems through more exten...},
author = {Hofmeyr, Steven A and Forrest, Stephanie and Somayaji, Anil},
doi = {10.1.1.43.6197},
isbn = {0926-227X},
issn = {0926227X},
journal = {Journal of Computer Security},
number = {3},
pages = {151--180},
pmid = {8041695},
title = {{Intrusion Detection using Sequences of System Calls}},
volume = {6},
year = {1998}
}
@article{Overturf2000a,
abstract = {ABSTRACT. Including the Use of Pneumococcal Conjugate and Polysaccharide Vaccines and Antibiotic Prophylaxis Pneumococcal infections are the most common invasive bacterial infections in children in the United States. The incidence of invasive pneumococcal infections peaks in children younger than 2 years, reaching rates of 228/100 000 in children 6 to 12 months old. Children with functional or anatomic asplenia (including sickle cell disease [SCD]) and children with human immunodeficiency virus infection have pneumococcal infection rates 20- to 100-fold higher than those of healthy children during the first 5 years of life. Others at high risk of pneumococcal infections include children with congenital immunodeficiency; chronic cardiopulmonary disease; children receiving immunosuppressive chemotherapy; children with immunosuppressive neoplastic diseases; children with chronic renal insufficiency, including nephrotic syndrome; children with diabetes; and children with cerebrospinal fluid leaks. Children of Native American (American Indian and Alaska Native) or African American descent also have higher rates of invasive pneumococcal disease. Outbreaks of pneumococcal infection have occurred with increased frequency in children attending out-of-home care. Among these children, nasopharyngeal colonization rates of 60{\%} have been observed, along with pneumococci resistant to multiple antibiotics. The administration of antibiotics to children involved in outbreaks of pneumococcal disease has had an inconsistent effect on nasopharyngeal carriage. In contrast, continuous penicillin prophylaxis in children younger than 5 years with SCD has been successful in reducing rates of pneumococcal disease by 84{\%}. Pneumococcal polysaccharide vaccines have been recommended since 1985 for children older than 2 years who are at high risk of invasive disease, but these vaccines were not recommended for younger children and infants because of poor antibody response before 2 years of age... [ABSTRACT FROM AUTHOR]},
author = {Overturf, Gary D},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Detecting Anomalies in a Time Series Database.pdf:pdf},
issn = {00314005},
journal = {Pediatrics},
keywords = {BACTERIAL vaccines,PNEUMOCOCCAL vaccine,STREPTOCOCCAL diseases,VACCINATION,VACCINATION of children},
number = {2},
pages = {367},
title = {{Technical Report.}},
url = {http://search.ebscohost.com/login.aspx?direct=true{\&}db=aph{\&}AN=3449077{\&}site=ehost-live},
volume = {106},
year = {2000}
}
@article{Keogh2004,
abstract = {Given the recent explosion of interest in streaming data and online algorithms, clustering of time seriessubsequences, extracted via a sliding window, has received much attention. In this work we make asurprising claim. Clustering of time series subsequences is meaningless. More concretely, clusters extractedfrom these time series are forced to obey a certain constraint that is pathologically unlikely to be satisfied byany dataset, and because of this, the clusters extracted by any clustering algorithm are essentially random.},
author = {Keogh, Eamonn and Keogh, Eamonn and Lin, Jessica and Lin, Jessica},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Clustering of Time Series Subsequences is Meaningless.pdf:pdf},
pmid = {670978},
title = {{Clustering of Time Series Subsequences is Meaningless:}},
url = {http://citeseer.ist.psu.edu/670978},
year = {2004}
}
@article{Qiao2002,
abstract = {An anomaly intrusion detection method based on HMM is presented. The system call trace of a UNIX privileged process is passed to a HMM to obtain state transition sequences. Preliminary experiments prove the state transition sequences can express the different mode between normal action and intrusion behaviour in a more stable and simple manner},
author = {Qiao, Y. and Xin, X.W. and Bin, Y. and Ge, S.},
doi = {10.1049/el:20020467},
isbn = {doi:10.1049/el:20020467},
issn = {00135194},
journal = {Electronics Letters},
keywords = {HMM,UNIX,Unix,anomaly intrusion detection method,experiments,hidden Markov model,hidden Markov models,normal action,privileged process,security of data,state transition sequences,system call trace},
number = {13},
pages = {663},
title = {{Anomaly intrusion detection method based on HMM}},
url = {http://digital-library.theiet.org/content/journals/10.1049/el{\_}20020467},
volume = {38},
year = {2002}
}
@article{Zaremba2014,
abstract = {We present a simple regularization technique for Recurrent Neural Networks (RNNs) with Long Short-Term Memory (LSTM) units. Dropout, the most successful technique for regularizing neural networks, does not work well with RNNs and LSTMs. In this paper, we show how to correctly apply dropout to LSTMs, and show that it substantially reduces overfitting on a variety of tasks. These tasks include language modeling, speech recognition, image caption generation, and machine translation.},
author = {Zaremba, Wojciech and Sutskever, Ilya and Vinyals, Oriol},
journal = {arXiv:1409.2329 [cs]},
title = {{Recurrent Neural Network Regularization}},
url = {http://arxiv.org/abs/1409.2329$\backslash$nhttp://www.arxiv.org/pdf/1409.2329.pdf},
year = {2014}
}
@article{Ratanamahatana2004,
abstract = {The Dynamic Time Warping (DTW) distance measure is a technique that has long been known in speech recognition community. It allows a non-linear mapping of one signal to another by minimizing the distance between the two. A decade ago, DTW was introduced into Data Mining community as a utility for various tasks for time series problems including classification, clustering, and anomaly detection. The technique has flourished, particularly in the last three years, and has been applied to a variety of problems in various disciplines. In spite of DTW's great success, there are still several persistent “myths” about it. These myths have caused confusion and led to much wasted research effort. In this work, we will dispel these myths with the most comprehensive set of time series experiments ever conducted},
author = {Ratanamahatana, Ca and Keogh, E},
doi = {10.1097/01.CCM.0000279204.24648.44},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Everything you know about Dynamic Time Warping is Wrong.pdf:pdf},
issn = {00903493},
journal = {Third Workshop on Mining Temporal and Sequential Data},
keywords = {data mining,dynamic time warping,experimentation},
pages = {22--25},
pmid = {15513920},
title = {{Everything you know about dynamic time warping is wrong}},
url = {http://spoken-number-recognition.googlecode.com/svn/trunk/docs/Dynamic time warping/DTW{\_}myths.pdf},
year = {2004}
}
@article{Bayer2014,
abstract = {Leveraging advances in variational inference, we propose to enhance recurrent neural networks with latent variables, resulting in Stochastic Recurrent Networks (STORNs). The model i) can be trained with stochastic gradient methods, ii) allows structured and multi-modal conditionals at each time step, iii) features a reliable estimator of the marginal likelihood and iv) is a generalisation of deterministic recurrent neural networks. We evaluate the method on four polyphonic musical data sets and motion capture data.},
archivePrefix = {arXiv},
arxivId = {1411.7610},
author = {Bayer, Justin and Osendorfer, Christian},
eprint = {1411.7610},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/LEARNING STOCHASTIC RECURRENT NETWORKS.pdf:pdf},
pages = {1--9},
title = {{Learning Stochastic Recurrent Networks}},
url = {http://arxiv.org/abs/1411.7610},
year = {2014}
}
@inproceedings{snoek2012practical,
author = {Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P},
booktitle = {Advances in neural information processing systems},
pages = {2951--2959},
title = {{Practical Bayesian optimization of machine learning algorithms}},
year = {2012}
}
@article{PhysioNet,
annote = {Circulation Electronic Pages:
http://circ.ahajournals.org/cgi/content/full/101/23/e215
PMID:1085218; doi: 10.1161/01.CIR.101.23.e215},
author = {Goldberger, A L and Amaral, L A N and Glass, L and Hausdorff, J M and Ivanov, P Ch. and Mark, R G and Mietus, J E and Moody, G B and Peng, C.-K. and Stanley, H E},
journal = {Circulation},
number = {23},
pages = {e215----e220},
title = {{PhysioBank, PhysioToolkit, and PhysioNet: Components of a New Research Resource for Complex Physiologic Signals}},
volume = {101},
year = {2000}
}
@article{Keogh2007,
abstract = {In this work we introduce the new problem of finding time series dis- cords. Time series discords are subsequences of longer time series that are max- imally different to all the rest of the time series subsequences. They thus capture the sense of the most unusual subsequence within a time series. While discords have many uses for data mining, they are particularly attractive as anomaly de- tectors because they only require one intuitive parameter (the length of the sub- sequence) unlike most anomaly detection algorithms that typically require many parameters. While the brute force algorithm to discover time series discords is quadratic in the length of the time series, we show a simple algorithm that is three to four orders of magnitude faster than brute force, while guaranteed to produce identical results. We evaluate our work with a comprehensive set of experiments on diverse data sources including electrocardiograms, space telemetry, respiration physiology, anthropological and video datasets},
author = {Keogh, Eamonn and Lin, Jessica and Lee, Sang-Hee Hee and {Van Herle}, Helga},
doi = {10.1007/s10115-006-0034-6},
isbn = {0219-1377},
issn = {02191377},
journal = {Knowledge and Information Systems},
keywords = {Anomaly detection,Clustering,Time series data mining},
number = {1},
pages = {1--27},
title = {{Finding the most unusual time series subsequence: Algorithms and applications}},
volume = {11},
year = {2007}
}
@article{Fomel2013,
abstract = {The Madagascar software package is designed for analysis of large-scale multidimensional data, such as those occurring in exploration geophysics. Madagascar provides a framework for reproducible research. By “reproducible research” we refer to the discipline of attaching software codes and data to computational results reported in publications. The package contains a collection of (a) computational modules, (b) data-processing scripts, and (c) research papers. Madagascar is distributed on SourceForge under a GPL v2 license  https://sourceforge.net/projects/rsf/ . By October 2013, more than 70 people from different organizations around the world have contributed to the project, with increasing year-to-year activity. The Madagascar website is  http://www.ahay.org/ .},
author = {Fomel, Sergey and Sava, Paul and Vlad, Ioan and Liu, Yang and Bashkardin, Vladimir},
doi = {http://dx.doi.org/10.5334/jors.ag},
issn = {2049-9647},
journal = {Journal of Open Research Software},
keywords = {data analysis,geophysics,python,reproducibility,seismology},
number = {1},
pages = {e8},
title = {{Madagascar: open-source software project for multidimensional data analysis and reproducible computational experiments}},
url = {http://openresearchsoftware.metajnl.com/article/view/jors.ag/20},
volume = {1},
year = {2013}
}
@article{Birant2006,
abstract = {Outlier detection is one of the major data mining methods. This paper proposes a three-step approach to detect spatio-temporal outliers in large databases. These steps are clustering, checking spatial neighbors, and checking temporal neighbors. In this paper, we introduce a new outlier detection algorithm to find small groups of data objects that are exceptional when compared with rest large amount of data. In contrast to the existing outlier detection algorithms, new algorithm has the ability of discovering outliers according to the non-spatial, spatial and temporal values of the objects. In order to demonstrate the new algorithm, this paper also presents an example application using a data warehouse},
author = {Birant, Derya},
doi = {10.2498/cit.2006.04.04},
isbn = {953-7138-05-4},
issn = {13301136},
journal = {Journal of Computing and Information Technology},
keywords = {also,aspects,data,data mining,data warehouse,don,most studies,outlier detection,spatio temporal,t consider temporal,temporal outlier detection should},
number = {4},
pages = {291--297},
title = {{Spatio-Temporal Outlier Detection in Large Databases}},
url = {http://cit.zesoi.fer.hr/browsePaper.php?paper=952},
volume = {14},
year = {2006}
}
@article{Keogh2002,
abstract = {mining time series data. Literally hundreds of papers have introduced new algorithms to index, classify, cluster and segment time series. In this work we make the following claim. Much of this work has very little utility because the contribution made (speed in the case of indexing, accuracy in the case of classification and clustering, model accuracy in the case of segmentation) offer an amount of "improvement" that would have been completely dwarfed by the variance that would have been observed by testing on many real world datasets, or the variance that would have been observed by changing minor (unstated) implementation details.},
author = {Keogh, Eamonn and Kasetty, Shruti},
doi = {10.1145/775047.775062},
isbn = {158113567X},
journal = {Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining - KDD '02},
keywords = {Sonar,TS,data mining,experimental evaluation,time series},
pages = {102},
title = {{On the need for time series data mining benchmarks}},
url = {http://dl.acm.org/citation.cfm?id=775047.775062},
year = {2002}
}
@article{Siegelmann1993,
abstract = {The authors pursue a particular approach to analog computation,$\backslash$nbased on dynamical systems of the type used in neural networks research.$\backslash$nThe systems have a fixed structure, invariant in time, corresponding to$\backslash$nan unchanging number of `neurons'. If allowed exponential time for$\backslash$ncomputation, they turn out to have unbounded power. However, under$\backslash$npolynomial-time constraints there are limits on their capabilities,$\backslash$nthough being more powerful than Turing machines. These networks are not$\backslash$nlikely to solve polynomially-NP-hard problems, as the equality `P=NP'$\backslash$nimplies the almost complete collapse of the standard polynomial$\backslash$nhierarchy. In contrast to classical computational models, the models$\backslash$nstudied exhibit at least some robustness with respect to noise and$\backslash$nimplementation errors},
author = {Siegelmann, H.T. and Sontag, E.D.},
doi = {10.1109/ISTCS.1993.253479},
isbn = {0-8186-3630-0},
issn = {03043975},
journal = {[1993] The 2nd Israel Symposium on Theory and Computing Systems},
pages = {1--2},
pmid = {22009344},
title = {{Analog computation via neural networks}},
year = {1993}
}
@article{Wang2013b,
abstract = {The previous decade has brought a remarkable increase of the interest in applications that deal with querying and mining of time series data. Many of the research efforts in this context have focused on introducing new representation methods for dimensionality reduction or novel similarity measures for the underlying data. In the vast majority of cases, each individual work introducing a particular method has made specific claims and, aside from the occasional theoretical justifications, provided quantitative experimental observations. However, for the most part, the comparative aspects of these experiments were too narrowly focused on demonstrating the benefits of the proposed methods over some of the previously introduced ones. In order to provide a comprehensive validation, we conducted an extensive experimental study re-implementing eight different time series representations and nine similarity measures and their variants, and testing their effectiveness on thirty-eight time series data sets from a wide variety of application domains. In this paper, we give an overview of these different techniques and present our comparative experimental findings regarding their effectiveness. In addition to providing a unified validation of some of the existing achievements, our experiments also indicate that, in some cases, certain claims in the literature may be unduly optimistic.},
archivePrefix = {arXiv},
arxivId = {1012.2789},
author = {Wang, Xiaoyue and Mueen, Abdullah and Ding, Hui and Trajcevski, Goce and Scheuermann, Peter and Keogh, Eamonn},
doi = {10.1007/s10618-012-0250-5},
eprint = {1012.2789},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Experimental comparison of representation methods and distance measures for time series data.pdf:pdf},
issn = {13845810},
journal = {Data Mining and Knowledge Discovery},
keywords = {Distance measure,Experimental comparison,Representation,Time series},
number = {2},
pages = {275--309},
title = {{Experimental comparison of representation methods and distance measures for time series data}},
volume = {26},
year = {2013}
}
@article{Snoek2012,
abstract = {Machine learning algorithms frequently require careful tuning of model hyperparameters, regularization terms, and optimization parameters. Unfortunately, this tuning is often a "black art" that requires expert experience, unwritten rules of thumb, or sometimes brute-force search. Much more appealing is the idea of developing automatic approaches which can optimize the performance of a given learning algorithm to the task at hand. In this work, we consider the automatic tuning problem within the framework of Bayesian optimization, in which a learning algorithm's generalization performance is modeled as a sample from a Gaussian process (GP). The tractable posterior distribution induced by the GP leads to efficient use of the information gathered by previous experiments, enabling optimal choices about what parameters to try next. Here we show how the effects of the Gaussian process prior and the associated inference procedure can have a large impact on the success or failure of Bayesian optimization. We show that thoughtful choices can lead to results that exceed expert-level performance in tuning machine learning algorithms. We also describe new algorithms that take into account the variable cost (duration) of learning experiments and that can leverage the presence of multiple cores for parallel experimentation. We show that these proposed algorithms improve on previous automatic procedures and can reach or surpass human expert-level optimization on a diverse set of contemporary algorithms including latent Dirichlet allocation, structured SVMs and convolutional neural networks.},
archivePrefix = {arXiv},
arxivId = {1206.2944},
author = {Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P.},
eprint = {1206.2944},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Practical Bayesian Optimization of Machine Learning Algorithms.pdf:pdf},
pages = {1--12},
title = {{Practical Bayesian Optimization of Machine Learning Algorithms}},
url = {http://arxiv.org/abs/1206.2944},
year = {2012}
}
@article{Bengio2012b,
abstract = {Learning algorithms related to artificial neural net- works and in particular for Deep Learning may seem to involve many bells and whistles, called hyper- parameters. This chapter is meant as a practical guide with recommendations for some of the most commonly used hyper-parameters, in particular in the context of learning algorithms based on back- propagated gradient and gradient-based optimiza- tion. It also discusses how to deal with the fact that more interesting results can be obtained when allow- ing one to adjust many hyper-parameters. Overall, it describes elements of the practice used to successfully and efficiently train and debug large-scale and often deep multi-layer neural networks. It closes with open questions about the training difficulties observedwith deeper architectures. 1},
archivePrefix = {arXiv},
arxivId = {1206.5533},
author = {Bengio, Yoshua},
doi = {10.1007/978-3-642-35289-8-26},
eprint = {1206.5533},
isbn = {9783642352881},
issn = {03029743},
journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
pages = {437--478},
pmid = {25497547},
title = {{Practical recommendations for gradient-based training of deep architectures}},
volume = {7700 LECTU},
year = {2012}
}
@article{laurent2015batch,
author = {Laurent, C{\'{e}}sar and Pereyra, Gabriel and Brakel, Phil{\'{e}}mon and Zhang, Ying and Bengio, Yoshua},
journal = {arXiv preprint arXiv:1510.01378},
title = {{Batch Normalized Recurrent Neural Networks}},
year = {2015}
}
@article{Graves2013b,
abstract = {This paper shows how Long Short-term Memory recurrent neural networks can be used to generate complex sequences with long-range structure, simply by predicting one data point at a time. The approach is demonstrated for text (where the data are discrete) and online handwriting (where the data are real-valued). It is then extended to handwriting synthesis by allowing the network to condition its predictions on a text sequence. The resulting system is able to generate highly realistic cursive handwriting in a wide variety of styles.},
archivePrefix = {arXiv},
arxivId = {arXiv:1308.0850v5},
author = {Graves, Alex},
eprint = {arXiv:1308.0850v5},
journal = {arXiv preprint arXiv:1308.0850},
pages = {1--43},
title = {{Generating sequences with recurrent neural networks}},
url = {http://arxiv.org/abs/1308.0850},
year = {2013}
}
@unpublished{Bengio-et-al-2015-Book,
annote = {Book in preparation for MIT Press},
author = {Bengio, Yoshua and Goodfellow, Ian J and Courville, Aaron},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/ch10 sequence modeling recurrent and recursive nets.pdf:pdf;:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/ch13 Linear Factor Models and Auto-Encoders.pdf:pdf},
title = {{Deep Learning}},
url = {http://www.iro.umontreal.ca/{~}bengioy/dlbook},
year = {2015}
}
@article{Alles2013a,
author = {Alles, Irina},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Time Series Clustering in the Field of Agronomy.pdf:pdf},
number = {September},
title = {{Time Series Clustering in the Field of Agronomy}},
url = {https://team.inria.fr/zenith/files/2013/11/ia{\_}ma{\_}thesis{\_}final.pdf},
year = {2013}
}
@article{Deng2014,
author = {Deng, Li},
doi = {10.1017/atsip.2013.9},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/A tutorial survey of architectures, algorithms, and applications for deep learning apsipa.pdf:pdf},
issn = {2048-7703},
journal = {APSIPA Transactions on Signal and Information Processing},
keywords = {algorithms,deep learning,information processing,received 3 february 2012,revised 2 december 2013},
number = {January 2014},
pages = {e2},
title = {{A tutorial survey of architectures, algorithms, and applications for deep learning}},
url = {http://www.journals.cambridge.org/abstract{\_}S2048770313000097},
volume = {3},
year = {2014}
}
@article{Romeu2013b,
author = {Romeu, Pablo and Zamora-Mart{\'{i}}nez, Francisco and Botella-Rocamora, Paloma and Pardo, Juan},
doi = {10.1007/978-3-642-40728-4_57},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Time-Series Forecasting of Indoor Temperature Using Pre-trained Deep Neural Networks.pdf:pdf},
isbn = {9783642407277},
issn = {03029743},
keywords = {artificial neural networks,auto-,deep learning,encoders,energy efficiency,temperature forecasting,time series},
pages = {451--458},
title = {{Time-Series Forecasting of Indoor Temperature Using Pre-trained Deep Neural Networks}},
url = {http://link.springer.com/10.1007/978-3-642-40728-4{\_}57},
year = {2013}
}
@article{Gentleman2004,
abstract = {The Bioconductor project is an initiative for the collaborative creation of extensible software for computational biology and bioinformatics. The goals of the project include: fostering collaborative development and widespread use of innovative software, reducing barriers to entry into interdisciplinary scientific research, and promoting the achievement of remote reproducibility of research results. We describe details of our aims and methods, identify current challenges, compare Bioconductor to other open bioinformatics projects, and provide working examples.},
author = {Gentleman, Robert C and Carey, Vincent J and Bates, Douglas M and Bolstad, Ben and Dettling, Marcel and Dudoit, Sandrine and Ellis, Byron and Gautier, Laurent and Ge, Yongchao and Gentry, Jeff and Hornik, Kurt and Hothorn, Torsten and Huber, Wolfgang and Iacus, Stefano and Irizarry, Rafael and Leisch, Friedrich and Li, Cheng and Maechler, Martin and Rossini, Anthony J and Sawitzki, Gunther and Smith, Colin and Smyth, Gordon and Tierney, Luke and Yang, Jean Y H and Zhang, Jianhua},
doi = {10.1186/gb-2004-5-10-r80},
isbn = {1465-6914 (Electronic)$\backslash$n1465-6906 (Linking)},
issn = {1465-6914},
journal = {Genome biology},
keywords = {Computational Biology,Computational Biology: instrumentation,Computational Biology: methods,Internet,Reproducibility of Results,Software},
number = {10},
pages = {R80},
pmid = {15461798},
title = {{Bioconductor: open software development for computational biology and bioinformatics.}},
url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=545600{\&}tool=pmcentrez{\&}rendertype=abstract},
volume = {5},
year = {2004}
}
@article{Zhu2003,
abstract = {Burst detection is the activity of finding abnormal aggregates in data streams. Such aggregates are based on sliding windows over data streams. In some applications, we want to monitor many sliding window sizes simultaneously and to report those windows with aggregates significantly different from other periods. We will present a general data structure for detecting interesting aggregates over such elastic windows in near linear time. We present applications of the algorithm for detecting Gamma Ray Bursts in large-scale astrophysical data. Detection of periods with high volumes of trading activities and high stock price volatility is also demonstrated using real time Trade and Quote (TAQ) data from the New York Stock Exchange (NYSE). Our algorithm beats the direct computation approach by several orders of magnitude. Copyright 2003 ACM.},
author = {Zhu, Yunyue and Shasha, Dennis},
journal = {Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining - KDD '03},
pages = {336},
title = {{Efficient elastic burst detection in data streams}},
url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-77952383186{\&}partnerID=tZOtx3y1},
year = {2003}
}
@inproceedings{Brueckner2014,
abstract = {Non-verbal speech cues play an important role in human communication such as expressing emotional states or maintaining the conversational flow. In this paper we investigate the effect of applying deep bidirectional Long Short-Term Memory (BLSTM) recurrent neural networks to the Interspeech 2013 Computational Paralinguistics Social Signals Sub-Challenge dataset requiring frame-wise, speaker-independent detection and classification of laughter and filler vocalizations in speech. BLSTM networks tend to prevail over conventional neural network architectures whenever the recognition or regression task relies on an intelligent exploitation of temporal context information. We introduce deep BLSTM models by stacking several BLSTMs and by combining non-recurrent deep neural networks with BLSTMs. We demonstrate that this new approach achieves significant improvements over previous attempts and we increase the current state-of-the-art unweighted average area-under-the-curve (UAAUC) value of 92.4{\%} to 94.0{\%}. This is the best result on this task reported in the literature so far.},
author = {Brueckner, Raymond and Schulter, Bjorn},
booktitle = {2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
doi = {10.1109/ICASSP.2014.6854518},
isbn = {978-1-4799-2893-4},
keywords = {Context,Context modeling,Hidden Markov models,Long Short-Term Memory,Recurrent neural networks,Speech,Training,bidirectional long short term memory,deep BLSTM,deep BLSTM recurrent neural networks,paralinguistics,recurrent neural nets,recurrent neural networks,signal classification,social signal classification,speech processing,temporal context information},
pages = {4823--4827},
title = {{Social signal classification using deep blstm recurrent neural networks}},
year = {2014}
}
@article{Schubert2014,
abstract = {Outlier detection research has been seeing many new algorithms every year that often appear to be only slightly different from existing methods along with some experiments that show them to "clearly outperform" the others. However, few approaches come along with a clear analysis of existing methods and a solid theoretical differentiation. Here, we provide a formalized method of analysis to allow for a theoretical comparison and generalization of many existing methods. Our unified view improves understanding of the shared properties and of the differences of outlier detection models. By abstracting the notion of locality from the classic distance-based notion, our framework facilitates the construction of abstract methods for many special data types that are usually handled with specialized algorithms. In particular, spatial neighborhood can be seen as a special case of locality. Here we therefore compare and generalize approaches to spatial outlier detection in a detailed manner. We also discuss temporal data like video streams, or graph data such as community networks. Since we reproduce results of specialized approaches with our general framework, and even improve upon them, our framework provides reasonable baselines to evaluate the true merits of specialized approaches. At the same time, seeing spatial outlier detection as a special case of local outlier detection, opens up new potentials for analysis and advancement of methods. {\textcopyright} 2012 The Author(s).},
author = {Schubert, Erich and Zimek, Arthur and Kriegel, Hans Peter},
doi = {10.1007/s10618-012-0300-z},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Local outlier detection reconsidered a generalized view on locality with applications to spatial, video, and network outlier detection.pdf:pdf},
isbn = {1932-1872},
issn = {13845810},
journal = {Data Mining and Knowledge Discovery},
keywords = {Local outlier,Network outlier,Spatial outlier,Video outlier},
number = {1},
pages = {190--237},
title = {{Local outlier detection reconsidered: A generalized view on locality with applications to spatial, video, and network outlier detection}},
volume = {28},
year = {2014}
}
@article{Blum1992,
abstract = {We consider a 2-layer, 3-node, n-input neural network whose nodes compute linear threshold functions of their inputs. We show that it is NP-complete to decide whether there exist weights and thresholds for this network so that it produces output consistent with a given set of training examples. We extend the result to other simple networks. We also present a network for which training is hard but where switching to a more powerful representation makes training easier. These results suggest that those looking for perfect training algorithms cannot escape inherent computational difficulties just by considering only simple or very regular networks. They also suggest the importance, given a training problem, of finding an appropriate network and input encoding for that problem. It is left as an open problem to extend our result to nodes with nonlinear functions such as sigmoids.},
author = {Blum, Avrim L. and Rivest, Ronald L.},
doi = {10.1016/S0893-6080(05)80010-3},
isbn = {0-55869-019-5},
issn = {08936080},
journal = {Neural Networks},
keywords = {Computational complexity,Intractability,Learning,Multilayer perceptron,NP-completeness,Neural networks,Representation,Training},
number = {1},
pages = {117--127},
title = {{Training a 3-node neural network is NP-complete}},
url = {http://www.sciencedirect.com/science/article/pii/S0893608005800103},
volume = {5},
year = {1992}
}
@article{Pascanu2013,
abstract = {There are two widely known issues with prop- erly training Recurrent Neural Networks, the vanishing and the exploding gradient prob- lems detailed in Bengio et al. (1994). In this paper we attempt to improve the under- standing of the underlying issues by explor- ing these problems from an analytical, a geo- metric and a dynamical systems perspective. Our analysis is used to justify a simple yet ef- fective solution. We propose a gradient norm clipping strategy to deal with exploding gra- dients and a soft constraint for the vanishing gradients problem. We validate empirically our hypothesis and proposed solutions in the experimental section.},
archivePrefix = {arXiv},
arxivId = {arXiv:1211.5063v2},
author = {Pascanu, Razvan and Mikolov, Tomas and Bengio, Yoshua},
doi = {10.1109/72.279181},
eprint = {arXiv:1211.5063v2},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/On the difficulty of training recurrent neural networks.pdf:pdf},
issn = {1045-9227},
journal = {International Conference on Machine Learning},
number = {2},
pages = {1310--1318},
pmid = {18267787},
title = {{On the difficulty of training recurrent neural networks}},
url = {http://jmlr.org/proceedings/papers/v28/pascanu13.pdf},
year = {2013}
}
@article{Salvador2005,
abstract = {The normal operation of a device can be characterized in different temporal states. To identify these states, we introduce a segmentation algorithm called Gecko that can determine a reasonable number of segments using our proposed L method.We then use the RIPPER classification algorithm to describe these states in logical rules. Finally, transitional logic between the states is added to create a finite state automaton. Our empirical results, on data obtained from the NASA shuttle program, indicate that the Gecko segmentation algorithm is comparable to a human expert in identifying states, and our L method performs better than the existing permutation tests method when determining the number of segments to return in segmentation algorithms. Empirical results have also shown that our overall system can track normal behavior and detect anomalies},
author = {Salvador, Stan and Chan, Philip},
doi = {10.1007/s10489-005-4610-3},
issn = {0924669X},
journal = {Applied Intelligence},
keywords = {Anomaly detection,Cluster validation,Clustering,Segmentation,Time series},
pages = {241--255},
title = {{Learning states and rules for detecting anomalies in time series}},
volume = {23},
year = {2005}
}
@article{Jiang2006,
abstract = {With the prevalence of Internet services and the increase of their complexity, there is a growing need to improve their operational reliability and availability. While a large amount of monitoring data can be collected from systems for fault analysis, it is hard to correlate this data effectively across distributed systems and observation time. In this paper, we analyze the mass characteristics of user requests and propose a novel approach to model and track transaction flow dynamics for fault detection in complex information systems. We measure the flow intensity at multiple checkpoints inside the system and apply system identification methods to model transaction flow dynamics between these measurements. With the learned analytical models, a model-based fault detection and isolation method is applied to track the flow dynamics in real time for fault detection. We also propose an algorithm to automatically search and validate the dynamic relationship between randomly selected monitoring points. Our algorithm enables systems to have self-cognition capability for system management. Our approach is tested in a real system with a list of injected faults. Experimental results demonstrate the effectiveness of our approach and algorithms},
author = {Jiang, Guofei and Chen, Haifeng and Yoshihira, Kenji},
chapter = {0t Page},
doi = {10.1109/TDSC.2006.52},
isbn = {1545-5971},
issn = {1545-5971},
journal = {IEEE Transactions on Dependable and Secure Computing},
keywords = {Internet,Internet service,complex information system,data handling,distributed processing,distributed system,fault analysis,fault detection,fault diagnosis,information systems,monitoring,operational availability,operational reliability,self-cognition capability,system identification,system management,transaction flow dynamics,transaction processing},
number = {4},
pages = {312--326},
title = {{Modeling and tracking of transaction flow dynamics for fault detection in complex systems}},
volume = {3},
year = {2006}
}
@article{Ngkvist2014,
abstract = {PATTERN RECOGNITION LETTERS, 42 (2014) 11-24. doi:10.1016/j.patrec.2014.01.008},
author = {L{\"{a}}ngkvist, Martin and Karlsson, Lars and Loutfi, Amy and Ngkvist, Martin L {\~{A}} and Karlsson, Lars and Loutfi, Amy},
doi = {10.1016/j.patrec.2014.01.008},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/A review of unsupervised feature learning and deep learning for time-series modeling.pdf:pdf;:C$\backslash$:/Users/Majid/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/L{\"{a}}ngkvist, Karlsson, Loutfi - 2014 - A review of unsupervised feature learning and deep learning for time-series modeling q.pdf:pdf},
issn = {01678655},
journal = {Pattern Recognition Letters},
number = {C},
pages = {11--24},
title = {{A review of unsupervised feature learning and deep learning for time-series modeling}},
url = {http://dx.doi.org/10.1016/j.patrec.2014.01.008$\backslash$npapers3://publication/doi/10.1016/j.patrec.2014.01.008},
volume = {42},
year = {2014}
}
@article{Wei2005,
abstract = {Recent advancements in sensor technology have made it possible to collect enormous amounts of data in real time. However, because of the sheer volume of data most of it will never be inspected by an algorithm, much less a human being. One way to mitigate this problem is to perform some type of anomaly (novelty detection and flag unusual patterns for further inspection by humans or more CPU intensive algorithms. Most current solutions are custom made for particular domains, such as ECG monitoring, valve pressure monitoring, etc. This customization requires extensive effort by domain expert. Furthermore, hand-crafted systems tend to be very brittle to concept drift. In this demonstration, we will show an online anomaly detection system that does not need to be customized for individual domains, yet performs with exceptionally high precision/recall. The system is based on the recently introduced idea of time series bitmaps. To demonstrate the universality of our system, we will allow testing on independently annotated datasets from domains as diverse as ECGs, Space Shuttle telemetry monitoring, video surveillance, and respiratory data. In addition, we invite attendees to test our system with any dataset available on the web.},
author = {Wei, Li and Kumar, Nitin and Lolla, Venkata and Keogh, Eamonn and Lonardi, Stefano and Ann, Chotirat},
isbn = {188888111X},
journal = {Siam},
pages = {1--4},
title = {{Assumption-free anomaly detection in time series}},
url = {http://www.cs.ucr.edu/{~}wli/publications/WeiL{\_}AnomalyDetection.doc$\backslash$nwww.cs.ucr.edu/{~}ratana/SSDBM05.pdf},
year = {2005}
}
@inproceedings{Fan2015,
abstract = {Long short-term memory (LSTM) is a specific recurrent neural network (RNN) architecture that is designed to model temporal sequences and their long-range dependencies more accurately than conventional RNNs. In this paper, we propose to use deep bidirectional LSTM (BLSTM) for audio/visual modeling in our photo-real talking head system. An audio/visual database of a subject's talking is firstly recorded as our training data. The audio/visual stereo data are converted into two parallel temporal sequences, i.e., contextual label sequences obtained by forced aligning audio against text, and visual feature sequences by applying active-appearancemodel (AAM) on the lower face region among all the training image samples. The deep BLSTM is then trained to learn the regression model by minimizing the sum of square error (SSE) of predicting visual sequence from label sequence. After testing different network topologies, we interestingly found the best network is two BLSTM layers sitting on top of one feed-forward layer on our datasets. Compared with our previous HMM-based system, the newly proposed deep BLSTM is better on both objective measurement and subjective A/B test.},
author = {Fan, Bo and Wang, Lijuan and Soong, Frank K and Xie, Lei},
month = {apr},
publisher = {IEEE – Institute of Electrical and Electronics Engineers},
title = {{PHOTO-REAL TALKING HEAD WITH DEEP BIDIRECTIONAL LSTM}},
url = {http://research.microsoft.com/apps/pubs/default.aspx?id=238346},
year = {2015}
}
@article{Ng2006a,
author = {Ng, B},
doi = {10.2172/900157},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Survey of Anomaly Detection Methods.pdf:pdf},
keywords = {and information science,computing,detection,distribution,general and miscellaneous//mathematics,lawrence livermore national laboratory},
title = {{Survey of Anomaly Detection Methods}},
url = {http://www.osti.gov/scitech/biblio/900157-VDshbd/},
year = {2006}
}
@article{Sutskever2014,
abstract = {Deep Neural Networks (DNNs) are powerful models that have achieved excellent performance on difficult learning tasks. Although DNNs work well whenever large labeled training sets are available, they cannot be used to map sequences to sequences. In this paper, we present a general end-to-end approach to sequence learning that makes minimal assumptions on the sequence structure. Our method uses a multilayered Long Short-Term Memory (LSTM) to map the input sequence to a vector of a fixed dimensionality, and then another deep LSTM to decode the target sequence from the vector. Our main result is that on an English to French translation task from the WMT'14 dataset, the translations produced by the LSTM achieve a BLEU score of 34.8 on the entire test set, where the LSTM's BLEU score was penalized on out-of-vocabulary words. Additionally, the LSTM did not have difficulty on long sentences. For comparison, a phrase-based SMT system achieves a BLEU score of 33.3 on the same dataset. When we used the LSTM to rerank the 1000 hypotheses produced by the aforementioned SMT system, its BLEU score increases to 36.5, which is close to the previous best result on this task. The LSTM also learned sensible phrase and sentence representations that are sensitive to word order and are relatively invariant to the active and the passive voice. Finally, we found that reversing the order of the words in all source sentences (but not target sentences) improved the LSTM's performance markedly, because doing so introduced many short term dependencies between the source and the target sentence which made the optimization problem easier.},
archivePrefix = {arXiv},
arxivId = {1409.3215},
author = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V.},
eprint = {1409.3215},
isbn = {1409.3215},
journal = {NIPS},
pages = {9},
title = {{Sequence to Sequence Learning with Neural Networks}},
url = {http://arxiv.org/abs/1409.3215},
year = {2014}
}
@inproceedings{Yuxiang2005,
abstract = {Outlier detecting is one of the most important data analysis technologies in data mining, which can be used to discover anomalous phenomena in huge dataset. Many literatures on spatial outlier detecting and time series outlier detecting have appeared, while the area of spatio-temporal outliers considering both spatial and temporal dimensions has still rarely been touched. Defining outliers in traditional dataset is more explicit because the data structure we need to focus on is very straightforward (e.g., a spatial point or a transaction record). However, it is much more difficult to give outlier a definite characterization in spatio-temporal lattice data, since there are so many data structures we can pay attention to. With the aim of detecting useful and meaningful outliers in climate dataset, we introduce a formalized way to define outliers in spatio-temporal lattice data, in which the importance of clarifying basic data structure (we call it basic element in our paper) is stressed. As a case study, we define two kinds of spatio-temporal outliers based on a global climate dataset, according to the three aspects we propose in defining an outlier. The introduction of basic element and the formulation of outlier definition process make it easier and clearer to define meaningful outliers. Thus outlier detecting in spatio-temporal lattice data will provide us with really interesting and useful knowledge.},
author = {Yuxiang, Sun and Kunqing, Xie and Xiujun, Ma and Xingxing, Jin and Wen, Pu and Xiaoping, Gao},
booktitle = {Proceedings. 2005 IEEE International Geoscience and Remote Sensing Symposium, 2005. IGARSS '05.},
doi = {10.1109/igarss.2005.1525218},
isbn = {0-7803-9050-4},
keywords = {outlier{\_}detection},
pages = {760--763},
title = {{Detecting spatio-temporal outliers in climate dataset: a method study}},
url = {http://dx.doi.org/10.1109/igarss.2005.1525218},
year = {2005}
}
@incollection{Dauphin,
author = {Dauphin, Yann and de Vries, Harm and Bengio, Yoshua},
booktitle = {Advances in Neural Information Processing Systems 28},
editor = {Cortes, C and Lawrence, N D and Lee, D D and Sugiyama, M and Garnett, R and Garnett, R},
pages = {1504--1512},
publisher = {Curran Associates, Inc.},
title = {{Equilibrated adaptive learning rates for non-convex optimization}},
url = {http://papers.nips.cc/paper/5870-equilibrated-adaptive-learning-rates-for-non-convex-optimization.pdf},
year = {2015}
}
@article{Stodden2013,
abstract = {Science is built upon foundations of theory and experiment validated and improved through open, trans-parent communication. With the increasingly central role of computation in scientific discovery this means communicating all details of the computations needed for others to replicate the experiment, i.e. making avail-able to others the associated data and code. The " reproducible research " movement recognizes that traditional scientific research and publication practices now fall short of this ideal, and encourages all those involved in the production of computational science – scientists who use computational methods and the institutions that employ them, journals and dissemination mechanisms, and funding agencies – to facilitate and practice really reproducible research. This report summarizes discussions that took place during the ICERM Workshop on Reproducibility in Computational and Experimental Mathematics, held December 10-14, 2012. The main recommendations that emerged from the workshop discussions are: 1. It is important to promote a culture change that will integrate computational reproducibility into the research process. 2. Journals, funding agencies, and employers should support this culture change. 3. Reproducible research practices and the use of appropriate tools should be taught as standard operat-ing procedure in relation to computational aspects of research. The workshop discussions included presentations of a number of the diverse and rapidly growing set of soft-ware tools available to aid in this effort. We call for a broad implementation of these three recommendations across the computational sciences.},
author = {Stodden, V. and Borwein, J and Bailey, D},
journal = {Computational Science Research. SIAM News},
pages = {4--6},
title = {{Setting the default to reproducible}},
volume = {46},
year = {2013}
}
@article{Ringberg2009,
abstract = {Unwanted traffic is a major concern in the Internet today. Unwanted traffic includes Denial of Service attacks, worms, and spam. Identifying and mitigating unwanted traffic costs businesses many billions of USD every year. The process of identifying this traffic is called anomaly detection, and Intrusion Detection Systems (IDS'es) are among the most prevalent techniques. IDS'es, such as Snort, allow users to write "rules" that specify the properties of traffic that should be detected and the corrective action to be taken in response. Unfortunately, applying these rules in an online setting can be prohibitively expensive for large networks, such as Tier-1 ISPs, which may have tens of thousands of links and many Gbps of traffic. In the first chapter of this thesis we present a system that leverages machine learning algorithms to detect the same type of unwanted traffic as Snort, but on summarized data for faster processing. Our results demonstrate that this system can effectively learn to classify many Snort rules with a high degree of accuracy.                 Unfortunately, distinguishing good traffic from unwanted traffic is challenging even in an offline setting because many types of unwanted traffic traffic, such as network attacks, deliberately mimic the behavior of normal traffic. We therefore propose that the targets of unwanted traffic should collaborate by correlating their attack data, under the assumption that a given malicious host is likely to affect more than one victim over time. That is, the senders of unwanted traffic will use individual computers ( i.e. , malicious hosts) repeatedly for various nefarious purposes in order to maximize their profits, and this repeated use will leave traces across networks. In the second chapter of this thesis we present a measurement study that quantifies the potential gain from this collaborative anomaly detection. Specifically, using traces from operational networks, we calculate the fraction of detected network anomalies ( viz. , IP scans, port scans, and DoS attacks) that could have been mitigated if some subset of the victims collaborated by sharing information about past perpetrators.                 One major challenge with the proposed collaborative anomaly detection is that the human owner/operators of participating networks are often hesitant to openly share information about the hosts (customers) that use their services. In the third chapter of the thesis we address this problem by proposing and evaluating the efficiency of a novel cryptographic protocol that allows victims to collaborate in a manner that protects their privacy. Our protocol allows participants to submit a set of IP addresses that they suspect might be engaging in unwanted activity, and it returns the set of IP addresses that existed in some fraction of all suspect sets ( i.e. , threshold set-intersection). The protocol preserves privacy because it never reveals who suspected whom, and a submitted IP address is only revealed when more than n  participating networks suspect it. Our implementation of said protocol is able to correlate millions of suspect IP addresses per hour when running on two quad-core machines.},
author = {Ringberg, Haakon Andreas},
isbn = {9781109378849},
journal = {ProQuest Dissertations and Theses},
keywords = {0984:Computer science,Anomaly detection,Applied sciences,Cryptography,IP addresses,Internet privacy,Intrusion detection systems},
number = {September},
pages = {128},
title = {{Privacy-preserving collaborative anomaly detection}},
url = {https://login.ctu.idm/oclc.org/?url=http://search.proquest.com/docview/304987587?accountid=26967 LA  - English},
year = {2009}
}
@article{Chuah2007,
abstract = {Recently, wireless sensor networks have been proposed for assisted living and residential monitoring. In such networks, physiological sensors are used to monitor vital signs e.g. heartbeats, pulse rates, oxygen saturation of senior citizens. Sensor data is sent periodically via wireless links to a personal computer that analyzes the data. In this paper, we propose an anomaly detection scheme based on time series analysis that will allow the computer to determine whether a stream of real-time sensor data contains any abnormal heartbeats. If anomaly exists, that time series segment will be transmitted via the network to a physician so that he/she can further diagnose the problem and take appropriate actions. When tested against the heartbeat data readings stored at the MIT database, our ECG anomaly scheme is shown to have better performance than another scheme that has been recently proposed. Our scheme enjoys an accuracy rate that varies from 70- 90{\%} while the other scheme has an accuracy that varies from 40-70{\%}},
author = {Chuah, MooiChoo and Fu, Fen},
doi = {10.1007/978-3-540-74767-3_14},
isbn = {978-3-540-74766-6},
journal = {Frontiers of High Performance Computing and Networking ISPA 2007 Workshops SE  - 14},
pages = {123--135},
title = {{ECG Anomaly Detection via Time Series Analysis}},
url = {http://dx.doi.org/10.1007/978-3-540-74767-3{\_}14},
volume = {4743},
year = {2007}
}
@article{Hill2010,
abstract = {The deployment of environmental sensors has generated an interest in real-time applications of the data they collect. This research develops a real-time anomaly detection method for environmental data streams that can be used to identify data that deviate from historical patterns. The method is based on an autoregressive data-driven model of the data stream and its corresponding prediction interval. It performs fast, incremental evaluation of data as it becomes available, scales to large quantities of data, and requires no pre-classification of anomalies. Furthermore, this method can be easily deployed on a large heterogeneous sensor network. Sixteen instantiations of this method are compared based on their ability to identify measurement errors in a windspeed data stream from Corpus Christi, Texas. The results indicate that a multilayer perceptron model of the data stream, coupled with replacement of anomalous data points, performs well at identifying erroneous data in this data stream. ?? 2009.},
author = {Hill, David J. and Minsker, Barbara S.},
doi = {10.1016/j.envsoft.2009.08.010},
isbn = {1364-8152},
issn = {13648152},
journal = {Environmental Modelling and Software},
keywords = {Anomaly detection,Artificial intelligence,Coastal environment,Data quality control,Data-driven modeling,Machine learning,Real-time data,Sensor networks},
number = {9},
pages = {1014--1022},
title = {{Anomaly detection in streaming environmental sensor data: A data-driven modeling approach}},
url = {http://dx.doi.org/10.1016/j.envsoft.2009.08.010},
volume = {25},
year = {2010}
}
@inproceedings{li2012dimensionality,
author = {Li, Zhinan and Xu, Wenyao and Huang, Anpeng and Sarrafzadeh, Majid},
booktitle = {Wearable and Implantable Body Sensor Networks (BSN), 2012 Ninth International Conference on},
organization = {IEEE},
pages = {161--165},
title = {{Dimensionality reduction for anomaly detection in electrocardiography: A manifold approach}},
year = {2012}
}
@article{Keogh2006,
abstract = {In this work, we introduce the new problem of finding time series discords. Time series discords are subsequences of longer time series that are maximally different to all the rest of the time series subsequences. They thus capture the sense of the most unusual subsequence within a time series. While discords have many uses for data mining, they are particularly attractive as anomaly detectors because they only require one intuitive parameter (the length of the subsequence), unlike most anomaly detection algorithms that typically require many parameters. While the brute force algorithm to discover time series discords is quadratic in the length of the time series, we show a simple algorithm that is three to four orders of magnitude faster than brute force, while guaranteed to produce identical results. We evaluate our work with a comprehensive set of experiments on electrocardiograms and other medical datasets.},
author = {Keogh, Eamonn and Lin, Jessica and Fu, Ada Waichee and {Van Herle}, Helga},
doi = {10.1109/TITB.2005.863870},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Finding Unusual Medical Time-Series Subsequences algorithms and applications.pdf:pdf},
issn = {1089-7771},
journal = {IEEE transactions on information technology in biomedicine : a publication of the IEEE Engineering in Medicine and Biology Society},
keywords = {Algorithms,Artificial Intelligence,Cluster Analysis,Computer Simulation,Diagnosis, Computer-Assisted,Diagnosis, Computer-Assisted: methods,Electrocardiography,Electrocardiography: methods,Humans,Information Storage and Retrieval,Information Storage and Retrieval: methods,Models, Biological,Pattern Recognition, Automated,Pattern Recognition, Automated: methods,Polysomnography,Polysomnography: methods,Reproducibility of Results,Sensitivity and Specificity,Time Factors},
number = {3},
pages = {429--39},
pmid = {16871709},
title = {{Finding unusual medical time-series subsequences: algorithms and applications.}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/16871709},
volume = {10},
year = {2006}
}
@misc{Johnson2015,
author = {Johnson, Leif},
title = {theanets},
url = {http://theanets.readthedocs.org/en/stable/},
year = {2015}
}
@article{qiao2002anomaly,
author = {Qiao, Yan and Xin, X W and Bin, Yang and Ge, S},
journal = {Electronics Letters},
number = {13},
pages = {663--664},
publisher = {IET},
title = {{Anomaly intrusion detection method based on HMM}},
volume = {38},
year = {2002}
}
@article{Sak2014,
abstract = {Long Short-Term Memory (LSTM) is a recurrent neural network (RNN) architecture that has been designed to address the vanishing and exploding gradient problems of conventional RNNs. Unlike feedforward neural networks, RNNs have cyclic connections making them powerful for modeling sequences. They have been successfully used for sequence labeling and sequence prediction tasks, such as handwriting recognition, language modeling, phonetic labeling of acoustic frames. However, in contrast to the deep neural networks, the use of RNNs in speech recognition has been limited to phone recognition in small scale tasks. In this paper, we present novel LSTM based RNN architectures which make more effective use of model parameters to train acoustic models for large vocabulary speech recognition. We train and compare LSTM, RNN and DNN models at various numbers of parameters and configurations. We show that LSTM models converge quickly and give state of the art speech recognition performance for relatively small sized models.},
archivePrefix = {arXiv},
arxivId = {arXiv:1402.1128v1},
author = {Sak, Ha$\backslash$csim and Senior, Andrew and Beaufays, Fran{\c{c}}oise},
eprint = {arXiv:1402.1128v1},
journal = {arXiv preprint arXiv:1402.1128},
number = {Cd},
title = {{Long Short-Term Memory Based Recurrent Neural Network Architectures for Large Vocabulary Speech Recognition}},
year = {2014}
}
@inproceedings{Fan2014,
abstract = {Feed-forward, Deep neural networks (DNN)-based text-to-speech (TTS) systems have been recently shown to outperform decision-tree clustered context-dependent HMM TTS systems [1, 4]. However, the long time span contextual effect in a speech utterance is still not easy to accommodate, due to the intrinsic, feed-forward nature in DNN-based modeling. Also, to synthesize a smooth speech trajectory, the dynamic features are commonly used to constrain speech parameter trajectory generation in HMM-based TTS [2]. In this paper, Recurrent Neural Networks (RNNs) with Bidirectional Long Short Term Memory (BLSTM) cells are adopted to capture the correlation or co-occurrence information between any two instants in a speech utterance for parametric TTS synthesis. Experimental results show that a hybrid system of DNN and BLSTM-RNN, i.e., lower hidden layers with a feed-forward structure which is cascaded with upper hidden layers with a bidirectional RNN structure of LSTM, can outperform either the conventional, decision tree-based HMM, or a DNN TTS system, both objectively and subjectively. The speech trajectory generated by the BLSTM-RNN TTS is fairly smooth and no dynamic constraints are needed.},
author = {Fan, Yuchen and Qian, Yao and Xie, Fenglong and Soong, Frank K.},
booktitle = {Interspeech-2014},
keywords = {Bidirectional LSTM,Index Terms,deep neural network,hidden Markov model,recurrent neural network,statistical parametric speech synthesis},
pages = {1964--1968},
title = {{TTS Synthesis with Bidirectional LSTM based Recurrent Neural Networks}},
year = {2014}
}
@article{Aggarwal2013a,
author = {Aggarwal, CC},
doi = {10.1007/978-},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/High-dimensional Outlier Detection Survey.pdf:pdf},
journal = {Outlier Analysis},
title = {{High-Dimensional Outlier Detection: The Subspace Method}},
url = {http://link.springer.com/chapter/10.1007/978-1-4614-6396-2{\_}5},
year = {2013}
}
@article{Hill2010a,
abstract = {Abstract. WATER RESOURCES RESEARCH, VOL. 45, W00D28, 16 PP., 2009 doi: / . Real-time Bayesian anomaly detection in streaming environmental data.  Res., 45, W00D28, doi: / . Cited By. Please wait one moment ... },
author = {Hill, David J. and Minsker, Barbara S. and Amir, Eyal},
doi = {10.1029/2008WR006956},
issn = {00431397},
journal = {Water Resources Research},
number = {4},
pages = {1--16},
title = {{Real-time Bayesian anomaly detection in streaming environmental data}},
volume = {46},
year = {2010}
}
@inproceedings{jagadish1999mining,
author = {Jagadish, H V and Koudas, Nick and Muthukrishnan, S},
booktitle = {Proceedings of the 25th International Conference on Very Large Data Bases},
organization = {Morgan Kaufmann Publishers Inc.},
pages = {102--113},
title = {{Mining Deviants in a Time Series Database}},
year = {1999}
}
@article{Portnoy2001,
abstract = {Intrusions pose a serious security risk in a network environment. Although systems can be hardened against many types of intrusions, often intrusions are successful making systems for detecting these intrusions critical to the security of these system. New intrusion types, of which detection systems are unaware, are the most dicult to detect. Current signature based methods and learning algorithms which rely on labeled data to train, generally can not detect these new intrusions. In addition, labeled training data in order to train misuse and anomaly detection systems is typically very expensive. We present a new type of clustering-based intrusion detection algorithm, unsupervised anomaly detection, which trains on unlabeled data in order to detect new intrusions. In our system, no manually or otherwise classified data is necessary for training. Our method is able to detect many different types of intrusions, while maintaining a low false positive rate as verifed over the KDD CUP 1999 dataset.},
author = {Portnoy, L and Eskin, E and Stolfo, S},
doi = {10.1.1.13.7523},
journal = {Proceedings of ACM CSS Workshop on Data Mining Applied to Security Philadelphia PA},
pages = {1--25},
title = {{Intrusion detection with unlabeled data using clustering}},
url = {http://freeworld.thc.org/root/docs/intrusion{\_}detection/nids/ID-with-Unlabeled-Data-Using-Clustering.pdf},
year = {2001}
}
@article{Chakrabarti2002,
abstract = {Similarity search in large time series databases has attracted much research interest recently. It is a difficult problem because of the typically high dimensionality of the data.. The most promising solutions involve performing dimensionality reduction on the data, then indexing the reduced data with a multidimensional index structure. Many dimensionality reduction techniques have been proposed, including Singular Value Decomposition (SVD), the Discrete Fourier transform (DFT), and the Discrete Wavelet Transform (DWT). In this work we introduce a new dimensionality reduction technique which we call Adaptive Piecewise Constant Approximation (APCA). While previous techniques (e.g., SVD, DFT and DWT) choose a common representation for all the items in the database that minimizes the global reconstruction error, APCA approximates each time series by a set of constant value segments of varying lengths such that their individual reconstruction errors are minimal. We show how APCA can be indexed using a multidimensional index structure. We propose two distance measures in the indexed space that exploit the high fidelity of APCA for fast searching: a lower bounding Euclidean distance approximation, and a non-lower bounding, but very tight Euclidean distance approximation and show how they can support fast exact searching, and even faster approximate searching on the same index structure. We theoretically and empirically compare APCA to all the other techniques and demonstrate its superiority. Categories},
author = {Chakrabarti, Kaushik and Keogh, Eamonn and Mehrotra, Sharad and Pazzani, Michael},
doi = {10.1145/568518.568520},
isbn = {1581133324},
issn = {03625915},
journal = {ACM Transactions on Database Systems},
keywords = {content-based retrieval,dimensionality reduction,indexing},
number = {2},
pages = {188--228},
pmid = {324778},
title = {{Locally adaptive dimensionality reduction for indexing large time series databases}},
volume = {27},
year = {2002}
}
@article{Schuster1997,
abstract = {In the first part of this paper, a regular recurrent neural network (RNN) is extended to a bidirectional recurrent neural network (BRNN). The BRNN can be trained without the limitation of using input information just up to a preset future frame. This is accomplished by training it simultaneously in positive and negative time direction. Structure and training procedure of the proposed network are explained. In regression and classification experiments on artificial data, the proposed structure gives better results than other approaches. For real data, classification experiments for phonemes from the TIMIT database show the same tendency. In the second part of this paper, it is shown how the proposed bidirectional structure can be easily modified to allow efficient estimation of the conditional posterior probability of complete symbol sequences without making any explicit assumption about the shape of the distribution. For this part, experiments on real data are reported},
author = {Schuster, M. and Paliwal, K. K},
doi = {10.1109/78.650093},
issn = {1053-587X},
journal = {IEEE Transactions on Signal Processing},
number = {11},
pages = {2673--2681},
title = {{Bidirectional recurrent neural networks}},
url = {http://ieeexplore.ieee.org/xpls/abs{\_}all.jsp?arnumber=650093},
volume = {45},
year = {1997}
}
@article{Cheng2006,
abstract = {{\textless}P{\textgreater}{\textless}/P{\textgreater}A spatial outlier is a spatially referenced object whose thematic attribute values are significantly different from those of other spatially referenced objects in its spatial neighborhood. It represents an object that is significantly different from its neighbourhoods even though it may not be significantly different from the entire population. Here we extend this concept to the spatio-temporal domain and define a spatial-temporal outlier (ST-outlier) to be a spatial-temporal object whose thematic attribute values are significantly different from those of other spatially and temporally referenced objects in its spatial or/and temporal neighbourhoods. Identification of ST-outliers can lead to the discovery of unexpected, interesting, and implicit knowledge, such as local instability or deformation. Many methods have been recently proposed to detect spatial outliers, but how to detect the temporal outliers or spatial-temporal outliers has been seldom discussed. In this paper we propose a multiscale approach to detect ST-outliers by evaluating the change between consecutive spatial and temporal scales. A four-step procedure consisting of classification, aggregation, comparison and verification is put forward to address the semantic and dynamic properties of geographic phenomena for ST-outlier detection. The effectiveness of the approach is illustrated by a practical coastal geomorphic study.},
author = {Cheng, Tao and Li, Zhilin},
doi = {10.1111/j.1467-9671.2006.00256.x},
issn = {13611682},
journal = {Transactions in GIS},
number = {2},
pages = {253--263},
title = {{A multiscale approach for spatio-temporal outlier detection}},
url = {http://discovery.ucl.ac.uk/76163/},
volume = {10},
year = {2006}
}
@misc{Buduma2015,
author = {Buduma, Nikhil},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/A Deep Dive into Recurrent Neural Nets.pdf:pdf},
title = {{A Deep Dive into Recurrent Neural Nets}},
urldate = {2015-12-04},
year = {2015}
}
@article{Bengio2013,
abstract = {Recent work has shown how denoising and contractive autoencoders implicitly capture the structure of the data-generating density, in the case where the cor- ruption noise is Gaussian, the reconstruction error is the squared error, and the data is continuous-valued. This has led to various proposals for sampling from this implicitly learned density function, using Langevin and Metropolis-Hastings MCMC. However, it remained unclear how to connect the training procedure of regularized auto-encoders to the implicit estimation of the underlying data- generating distribution when the data are discrete, or using other forms of corrup- tion process and reconstruction errors. Another issue is the mathematical justifi- cation which is only valid in the limit of small corruption noise. We propose here a different attack on the problem, which deals with all these issues: arbitrary (but noisy enough) corruption, arbitrary reconstruction loss (seen as a log-likelihood), handling both discrete and continuous-valued variables, and removing the bias due to non-infinitesimal corruption noise (or non-infinitesimal contractive penalty). 1},
archivePrefix = {arXiv},
arxivId = {1305.6663},
author = {Bengio, Yoshua and Yao, Li and Alain, Guillaume and Vincent, Pascal},
eprint = {1305.6663},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Generalized Denoising Auto-Encoders as Generative Models.pdf:pdf},
isbn = {10495258},
journal = {Advances in Neural {\ldots}},
pages = {1--9},
title = {{Generalized denoising auto-encoders as generative models}},
url = {http://papers.nips.cc/paper/5023-generalized-denoising-auto-encoders-as-generative-models},
year = {2013}
}
@article{Overturf2000,
abstract = {ABSTRACT. Including the Use of Pneumococcal Conjugate and Polysaccharide Vaccines and Antibiotic Prophylaxis Pneumococcal infections are the most common invasive bacterial infections in children in the United States. The incidence of invasive pneumococcal infections peaks in children younger than 2 years, reaching rates of 228/100 000 in children 6 to 12 months old. Children with functional or anatomic asplenia (including sickle cell disease [SCD]) and children with human immunodeficiency virus infection have pneumococcal infection rates 20- to 100-fold higher than those of healthy children during the first 5 years of life. Others at high risk of pneumococcal infections include children with congenital immunodeficiency; chronic cardiopulmonary disease; children receiving immunosuppressive chemotherapy; children with immunosuppressive neoplastic diseases; children with chronic renal insufficiency, including nephrotic syndrome; children with diabetes; and children with cerebrospinal fluid leaks. Children of Native American (American Indian and Alaska Native) or African American descent also have higher rates of invasive pneumococcal disease. Outbreaks of pneumococcal infection have occurred with increased frequency in children attending out-of-home care. Among these children, nasopharyngeal colonization rates of 60{\%} have been observed, along with pneumococci resistant to multiple antibiotics. The administration of antibiotics to children involved in outbreaks of pneumococcal disease has had an inconsistent effect on nasopharyngeal carriage. In contrast, continuous penicillin prophylaxis in children younger than 5 years with SCD has been successful in reducing rates of pneumococcal disease by 84{\%}. Pneumococcal polysaccharide vaccines have been recommended since 1985 for children older than 2 years who are at high risk of invasive disease, but these vaccines were not recommended for younger children and infants because of poor antibody response before 2 years of age... [ABSTRACT FROM AUTHOR]},
author = {Overturf, Gary D},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Anomaly Detection A Survey report.pdf:pdf},
issn = {00314005},
journal = {Pediatrics},
keywords = {BACTERIAL vaccines,PNEUMOCOCCAL vaccine,STREPTOCOCCAL diseases,VACCINATION,VACCINATION of children},
number = {2},
pages = {367},
title = {{Technical Report.}},
url = {http://search.ebscohost.com/login.aspx?direct=true{\&}db=aph{\&}AN=3449077{\&}site=ehost-live},
volume = {106},
year = {2000}
}
@inproceedings{Pascanu2013c,
abstract = {There are two widely known issues with prop- erly training Recurrent Neural Networks, the vanishing and the exploding gradient prob- lems detailed in Bengio et al. (1994). In this paper we attempt to improve the under- standing of the underlying issues by explor- ing these problems from an analytical, a geo- metric and a dynamical systems perspective. Our analysis is used to justify a simple yet ef- fective solution. We propose a gradient norm clipping strategy to deal with exploding gra- dients and a soft constraint for the vanishing gradients problem. We validate empirically our hypothesis and proposed solutions in the experimental section.},
archivePrefix = {arXiv},
arxivId = {arXiv:1211.5063v2},
author = {Pascanu, Razvan and Mikolov, Tomas and Bengio, Yoshua},
booktitle = {International Conference on Machine Learning},
doi = {10.1109/72.279181},
eprint = {arXiv:1211.5063v2},
issn = {1045-9227},
number = {2},
pages = {1310--1318},
pmid = {18267787},
title = {{On the difficulty of training recurrent neural networks}},
url = {http://jmlr.org/proceedings/papers/v28/pascanu13.pdf},
year = {2013}
}
@article{Keogh2005a,
abstract = { In this work, we introduce the new problem of finding time series discords. Time series discords are subsequences of a longer time series that are maximally different to all the rest of the time series subsequences. They thus capture the sense of the most unusual subsequence within a time series. Time series discords have many uses for data mining, including improving the quality of clustering, data cleaning, summarization, and anomaly detection. Discords are particularly attractive as anomaly detectors because they only require one intuitive parameter (the length of the subsequence) unlike most anomaly detection algorithms that typically require many parameters. We evaluate our work with a comprehensive set of experiments. In particular, we demonstrate the utility of discords with objective experiments on domains as diverse as Space Shuttle telemetry monitoring, medicine, surveillance, and industry, and we demonstrate the effectiveness of our discord discovery algorithm with more than one million experiments, on 82 different datasets from diverse domains.},
author = {Keogh, Eamonn and Lin, Jessica and Fu, Ada},
doi = {10.1109/ICDM.2005.79},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/HOT SAX Efficiently Finding the Most Unusual Time Series Subsequence.pdf:pdf},
isbn = {0769522785},
issn = {15504786},
journal = {Proceedings - IEEE International Conference on Data Mining, ICDM},
keywords = {Anomaly detection,Clustering,Time series data mining},
pages = {226--233},
title = {{HOT SAX: Efficiently finding the most unusual time series subsequence}},
year = {2005}
}
@misc{Ferrel2005,
author = {Ferrel, B and Santuro, S},
title = {{NASA Shuttle Valve Data}},
url = {http://www.cs.fit.edu/{~}pkc/nasa/data},
year = {2005}
}
@inproceedings{Li2009,
abstract = {Outlier detection in vehicle traffic data is a practical problem that has gained traction lately due to an increasing capability to track moving vehicles in city roads. In contrast to other applications, this particular domain includes a very dynamic dimension: time. Many existing algorithms have studied the problem of outlier detection at a single instant in time. This study proposes a method for detecting temporal outliers with an emphasis on historical similarity trends between data points. Outliers are calculated from drastic changes in the trends. Experiments with real world traffic data show that this approach is effective and efficient.},
author = {Li, Xiaolei and Li, Zhenhui and Han, Jiawei and Lee, Jae-gil},
booktitle = {IEEE 25th International Conference on Data Engineering},
doi = {10.1109/ICDE.2009.230},
isbn = {978-1-4244-3422-0},
issn = {1084-4627},
keywords = {outlier},
number = {July},
pages = {1319--1322},
title = {{Temporal Outlier Detection in Vehicle Traffic Data}},
url = {http://web.engr.illinois.edu/{~}hanj/pdf/icde09{\_}xli.pdf},
year = {2009}
}
@article{Sutskever2013b,
abstract = {Recurrent Neural Networks (RNNs) are powerful sequence models that were believed to be difficult to train, and as a result they were rarely used in machine learning applications. This thesis presents methods that overcome the difficulty of training RNNs, and applications of RNNs to challenging problems. We first describe a newprobabilistic sequence model that combines Restricted Boltzmann Machines and RNNs. The new model is more powerful than similar models while being less difficult to train. Next, we present a new variant of the Hessian-free (HF) optimizer and show that it can train RNNs on tasks that have extreme long-range temporal dependencies, which were previously considered to be impossibly hard. We then apply HF to character-level language modelling and get excellent results. We also apply HF to optimal control and obtain RNN control laws that can successfully operate under conditions of delayed feedback and unknown disturbances. Finally, we describe a random parameter initialization scheme that allows gradient descent with mo- mentum to train RNNs on problems with long-term dependencies. This directly contradicts widespread beliefs about the inability of first-order methods to do so, and suggests that previous attempts at training RNNs failed partly due to flaws in the random initialization},
author = {Sutskever, Ilya and {Ilya Sutskever}},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/TRAINING RECURRENT NEURAL NETWORKS.pdf:pdf},
journal = {Ph.D thesis},
pages = {101},
title = {{Training Recurrent Neural Networks}},
year = {2013}
}
@article{Ratanamahatana2004a,
abstract = {The Dynamic Time Warping (DTW) distance measure is a technique that has long been known in speech recognition community. It allows a non-linear mapping of one signal to another by minimizing the distance between the two. A decade ago, DTW was introduced into Data Mining community as a utility for various tasks for time series problems including classification, clustering, and anomaly detection. The technique has flourished, particularly in the last three years, and has been applied to a variety of problems in various disciplines. In spite of DTW's great success, there are still several persistent “myths” about it. These myths have caused confusion and led to much wasted research effort. In this work, we will dispel these myths with the most comprehensive set of time series experiments ever conducted},
author = {Ratanamahatana, Ca and Keogh, E},
doi = {10.1097/01.CCM.0000279204.24648.44},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Everything you know about Dynamic Time Warping is Wrong.pdf:pdf},
issn = {00903493},
journal = {Third Workshop on Mining Temporal and Sequential Data},
keywords = {data mining,dynamic time warping,experimentation},
pages = {22--25},
pmid = {15513920},
title = {{Everything you know about dynamic time warping is wrong}},
url = {http://spoken-number-recognition.googlecode.com/svn/trunk/docs/Dynamic time warping/DTW{\_}myths.pdf},
year = {2004}
}
@article{Drosdowsky1993,
abstract = {A detailed analysis of the variability of Australian district rainfall on seasonal time-scales over the period 1950-1987 is described. This paper, Part I, describes the dominant spatial modes or patterns of variability. The major analysis tool is rotated principal component analysis (RPCA), used in both the S and T modes. Various criteria are examined to determine the number of components to rotate, including conducting trial rotations and comparison of the resulting patterns with the corresponding one-point correlation maps. The stability of the chosen solutions is examined by repeating the analysis on various subsets of the data. The S-mode analysis, which groups districts with similar temporal variation, provides a regionalization of the continent into eight coherent and approximately equally sized regions. The results of this analysis closely resemble those obtained from cluster analysis. The T-mode analysis clusters seasons with similar large-scale spatial variations (anomaly patterns). The similarity measure used in the T-mode analysis is the congruence coefficient, rather than the correlation or covariance. The patterns produced by this analysis consist of continental-scale anomalies, similar in some respects to the unrotated S-mode patterns, but more amenable to meteorological interpretation. In particular the first pattern, which accounts for approximately 25 per cent of the total variance consists of anomalies of the same sign over the entire continent centred on south-east Australia. The relationship between the two modes of representation is also explored. Regression equations are developed to express the spatially complex T-mode patterns in terms of the localized S-mode patterns, and alternatively, to partition the variance of each of the S-mode patterns between the T-mode components. In Part II the temporal variability and the relationship of these patterns to the Southern Oscillation and other large-scale circulation anomalies are examined.},
author = {Drosdowsky, W},
doi = {10.1002/joc.3370130102},
issn = {0899-8418},
journal = {International Journal of Climatology},
keywords = {AUSTRALIA,ROTATED PRINCIPAL COMPONENT ANALYSIS,S},
pages = {1--30},
title = {{An Analysis of Seasonal Rainfall Anomalies - 1950-1987}},
volume = {13},
year = {1993}
}
@article{Martens2011a,
abstract = {Recurrent Neural Networks (RNNs) are very powerful sequence models that do not enjoy widespread use because it is extremely diffi- cult to train them properly. Fortunately, re- cent advances in Hessian-free optimization have been able to overcome the difficulties associated with training RNNs, making it possible to apply them successfully to challenging sequence prob- lems. In this paper we demonstrate the power of RNNs trained with the new Hessian-Free op- timizer (HF) by applying them to character-level language modeling tasks. The standard RNN ar- chitecture, while effective, is not ideally suited for such tasks, so we introduce a new RNN variant that uses multiplicative (or gated) con- nections which allow the current input charac- ter to determine the transition matrix from one hidden state vector to the next. After training the multiplicative RNN with the HF optimizer for five days on 8 high-end Graphics Processing Units, we were able to surpass the performance of the best previous single method for character- level language modeling a hierarchical non- parametric sequence model. To our knowledge this represents the largest recurrent neural net- work application to date.},
author = {Martens, James},
doi = {2},
isbn = {9781450306195},
journal = {Neural Networks},
number = {1},
pages = {1017--1024},
title = {{Generating Text with Recurrent Neural Networks}},
url = {http://www.icml-2011.org/papers/524{\_}icmlpaper.pdf},
volume = {131},
year = {2011}
}
@article{Rumelhart1986,
abstract = {We describe a new learning procedure, back-propagation, for networks of neurone-like units. The procedure repeatedly adjusts the weights of the connections in the network so as to minimize a measure of the difference between the actual output vecotr of the net and the desired output vector. As a result of the weight adjustments, internal 'hidden' units wich are not part of the input or output come to represent important features of the task domain, and the regularities in the task are captured by the interactions of these units. The ability to create useful new features distinguishes back-propagation from earlier, simpoler methods such as the perceptron-convergence procedure.},
author = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.},
doi = {10.1038/323533a0},
isbn = {0262661160},
issn = {0028-0836},
journal = {Nature},
number = {6088},
pages = {533--536},
pmid = {134},
title = {{Learning representations by back-propagating errors}},
volume = {323},
year = {1986}
}
@article{Chandola2008,
abstract = {We present a comparative evaluation of a large number of anomaly detection techniques on a variety of publicly available as well as artificially generated data sets. Many of these are existing techniques while some are slight variants and/or adaptations of traditional anomaly detection techniques to sequence data.},
author = {Chandola, V. and Mithal, V. and Kumar, V.},
doi = {10.1109/ICDM.2008.151},
isbn = {978-0-7695-3502-9},
issn = {1550-4786},
journal = {2008 Eighth IEEE International Conference on Data Mining},
keywords = {Anomaly Detection,Sequences},
pages = {743--748},
title = {{Comparative Evaluation of Anomaly Detection Techniques for Sequence Data}},
year = {2008}
}
@article{Kriegel2010,
abstract = {In the field of wireless sensor networks, those measurements that significantly deviate from the normal pattern of sensed data are considered as outliers. The potential sources of outliers include noise and errors, events, and malicious attacks on the network. Traditional outlier detection techniques are not directly applicable to wireless sensor networks due to the nature of sensor data and specific requirements and limitations of the wireless sensor networks. This survey provides a comprehensive overview of existing outlier detection techniques specifically developed for the wireless sensor networks. Additionally, it presents a technique-based taxonomy and a comparative table to be used as a guideline to select a technique suitable for the application at hand based on characteristics such as data type, outlier type, outlier identity, and outlier degree.},
author = {Kriegel, Hans-peter and Kr{\"{o}}ger, Peer and Zimek, Arthur},
doi = {10.1109/SURV.2010.021510.00088},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/kdd10-outlier-tutorial.pdf:pdf},
isbn = {9781612081441},
issn = {1553877X},
journal = {IEEE Communications Surveys},
number = {2},
pages = {159--170},
title = {{Outlier Detection Techniques for Wireless Sensor Networks: A Survey}},
url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8846{\&}rep=rep1{\&}type=pdf},
volume = {12},
year = {2010}
}
@misc{Colah2015,
author = {Colah},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Understanding LSTM Networks -- colah's blog.pdf:pdf},
title = {{Understanding LSTM Networks}},
urldate = {2015-12-04},
year = {2015}
}
@article{Bentley1997,
abstract = {We present theoretical algorithms for sorting and$\backslash$nsearching multikey data, and derive from them practical$\backslash$nC implementations for applications in which keys are$\backslash$ncharacter strings. The sorting algorithm blends$\backslash$nQuicksort and radix sort; it is competitive with the$\backslash$nbest known C sort codes. The searching algorithm blends$\backslash$ntries and binary search trees; it is faster than$\backslash$nhashing and other commonly used search methods. The$\backslash$nbasic ideas behind the algorithms date back at least to$\backslash$nthe 1960s, but their practical utility has been$\backslash$noverlooked. We also present extensions to more complex$\backslash$nstring problems, such as partial-match searching.},
author = {Bentley, Jl and Sedgewick, R},
doi = {http://dx.doi.org/10.1145/42411.42420},
isbn = {0-89871-390-0},
journal = {Proceedings of the eighth annual ACM-SIAM symposium on Discrete algorithms},
pages = {360--369},
title = {{Fast algorithms for sorting and searching strings}},
url = {http://dl.acm.org/citation.cfm?id=314321},
year = {1997}
}
@article{Wu2010,
abstract = {The detection of outliers from spatio-temporal data is an important task due to the increasing amount of spatio-temporal data available and the need to understand and interpret it. Due to the limitations of current data mining techniques, new techniques to handle this data need to be developed. We propose a spatio-temporal outlier detection algorithm called Outstretch, which discovers the outlier movement patterns of the top-k spatial outliers over several time periods. The top-k spatial outliers are found using the Exact-Grid Top- k and Approx-Grid Top- k algorithms, which are an extension of algorithms developed by Agarwal et al. [1]. Since they use the Kulldorff spatial scan statistic, they are capable of discovering all outliers, unaffected by neighbouring regions that may contain missing values. After generating the outlier sequences, we show one way they can be interpreted, by comparing them to the phases of the El Ni{\~{n}}o Southern Oscilliation (ENSO) weather phenomenon to provide a meaningful analysis of the results. {\textcopyright} 2010 Springer-Verlag Berlin Heidelberg.},
author = {Wu, Elizabeth and Liu, Wei and Chawla, Sanjay},
doi = {10.1007/978-3-642-12519-5_7},
isbn = {3642125182},
issn = {03029743},
journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
keywords = {Data Mining,Outlier Detection,Precipitation Extremes,South America,Spatio-Temporal},
pages = {115--133},
title = {{Spatio-temporal outlier detection in precipitation data}},
volume = {5840 LNCS},
year = {2010}
}
@article{Protopapas2005,
abstract = {We present a methodology to discover outliers in catalogs of periodic light-curves. We use cross-correlation as measure of ``similarity'' between two individual light-curves and then classify light-curves with lowest average ``similarity'' as outliers. We performed the analysis on catalogs of variable stars of known type from the MACHO and OGLE projects and established that our method correctly identifies light-curves that do not belong to those catalogs as outliers. We show how our method can scale to large datasets that will be available in the near future such as those anticipated from Pan-STARRS and LSST.},
archivePrefix = {arXiv},
arxivId = {astro-ph/0505495},
author = {Protopapas, P and Giammarco, J M and Faccioli, L and Struble, M F and Dave, R and Alcock, C},
doi = {10.1111/j.1365-2966.2006.10327.x},
eprint = {0505495},
isbn = {9780262013192},
issn = {00358711},
journal = {Monthly Notices of the Royal Astronomical Society},
keywords = {astronomical data bases,binaries,catalogues,cepheids,data analysis,eclipsing,methods,miscellaneous,other,stars,variables},
number = {2},
pages = {16},
primaryClass = {astro-ph},
title = {{Finding outlier light-curves in catalogs of periodic variable stars}},
url = {http://arxiv.org/abs/astro-ph/0505495},
volume = {369},
year = {2005}
}
@phdthesis{Alles2013,
author = {Alles, Irina},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Time Series Clustering in the Field of Agronomy.pdf:pdf},
number = {September},
school = {Darmstadt University of Technology},
title = {{Time Series Clustering in the Field of Agronomy}},
url = {https://team.inria.fr/zenith/files/2013/11/ia{\_}ma{\_}thesis{\_}final.pdf},
year = {2013}
}
@article{Zolhavarieh2014,
abstract = {Clustering of subsequence time series remains an open issue in time series clustering. Subsequence time series clustering is used in different fields, such as e-commerce, outlier detection, speech recognition, biological systems, DNA recognition, and text mining. One of the useful fields in the domain of subsequence time series clustering is pattern recognition. To improve this field, a sequence of time series data is used. This paper reviews some definitions and backgrounds related to subsequence time series clustering. The categorization of the literature reviews is divided into three groups: preproof, interproof, and postproof period. Moreover, various state-of-the-art approaches in performing subsequence time series clustering are discussed under each of the following categories. The strengths and weaknesses of the employed methods are evaluated as potential issues for future studies.},
author = {Zolhavarieh, Seyedjamal and Aghabozorgi, Saeed and Teh, Ying Wah},
doi = {10.1155/2014/312521},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/A Review of Subsequence Time Series Clustering.pdf:pdf},
isbn = {1537-744x},
issn = {1537-744X},
journal = {TheScientificWorldJournal},
pages = {312521},
pmid = {25140332},
title = {{A Review of Subsequence Time Series Clustering.}},
url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=4130317{\&}tool=pmcentrez{\&}rendertype=abstract},
volume = {2014},
year = {2014}
}
@article{Pascanu2013a,
abstract = {In this paper, we propose a novel way to extend a recurrent neural network (RNN) to a deep RNN. We start by arguing that the concept of the depth in an RNN is not as clear as it is in feedforward neural networks. By carefully analyzing and understanding the architecture of an RNN, we define three points which may be made deeper; (1) input-to-hidden function, (2) hidden-to-hidden transition and (3) hidden-to-output function. This can be considered in addition to stacking multiple recurrent layers proposed earlier by Schmidhuber (1992). Based on this observation, we propose two novel architectures of a deep RNN and provide an alternative interpretation of these deep RNN's using a novel framework based on neural operators. The proposed deep RNN's are empirically evaluated on the tasks of polyphonic music prediction and language modeling. The experimental result supports our claim that the proposed deep RNN's benefit from the depth and outperform the conventional, shallow RNN.},
archivePrefix = {arXiv},
arxivId = {1312.6026},
author = {Pascanu, Razvan and Gulcehre, Caglar and Cho, Kyunghyun and Bengio, Yoshua},
eprint = {1312.6026},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/How to Construct Deep Recurrent Neural Networks.pdf:pdf;:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/How to Construct Deep Recurrent Neural Networks.pdf:pdf},
journal = {arXiv preprint arXiv:1312.6026},
pages = {1--10},
title = {{How to Construct Deep Recurrent Neural Networks}},
url = {http://arxiv.org/abs/1312.6026},
year = {2013}
}
@inproceedings{fu2006finding,
author = {Fu, Ada Wai-chee and Leung, Oscar Tat-Wing and Keogh, Eamonn and Lin, Jessica},
booktitle = {Proceedings of the Second international conference on Advanced Data Mining and Applications},
organization = {Springer-Verlag},
pages = {31--41},
title = {{Finding time series discords based on haar transform}},
year = {2006}
}
@article{ZacharyC.Lipton2015,
abstract = {Countless learning tasks require dealing with sequential data. Image captioning, speech synthesis, music generation, and video game playing all require that a model generate sequences of outputs. In other domains, such as time series prediction, video analysis, and music information retrieval, a model must learn from sequences of inputs. Significantly more interactive tasks, such as natural language translation, engaging in dialogue, and robotic control, often demand both. Recurrent neural networks (RNNs) are a powerful family of connectionist models that capture time dynamics via cycles in the graph. Unlike feedforward neural networks, recurrent networks can process examples one at a time, retaining a state, or memory, that reflects an arbitrarily long context window. While these networks have long been difficult to train and often contain millions of parameters, recent advances in network architectures, optimization techniques, and parallel computation have enabled large-scale learning with recurrent nets. Over the past few years, systems based on state of the art long short-term memory (LSTM) and bidirectional recurrent neural network (BRNN) architectures have demonstrated record-setting performance on tasks as varied as image captioning, language translation, and handwriting recognition. In this review of the literature we synthesize the body of research that over the past three decades has yielded and reduced to practice these powerful models. When appropriate, we reconcile conflicting notation and nomenclature. Our goal is to provide a mostly self-contained explication of state of the art systems, together with a historical perspective and ample references to the primary research.},
archivePrefix = {arXiv},
arxivId = {1506.00019v2},
author = {{Zachary C. Lipton}},
eprint = {1506.00019v2},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/A Critical Review of Recurrent Neural Networks.pdf:pdf},
pages = {1--35},
title = {{A Critical Review of Recurrent Neural Networks for Sequence Learning}},
url = {http://arxiv.org/abs/1506.00019v2},
year = {2015}
}
@misc{Karpathy2015,
author = {Karpathy, Andrej},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/The Unreasonable Effectiveness of Recurrent Neural Networks.pdf:pdf},
title = {{The Unreasonable Effectiveness of Recurrent Neural Networks}},
url = {http://karpathy.github.io/2015/05/21/rnn-effectiveness/},
urldate = {2015-12-24},
year = {2015}
}
@article{Giardine2005,
abstract = {Accessing and analyzing the exponentially expanding genomic sequence and functional data pose a challenge for biomedical researchers. Here we describe an interactive system, Galaxy, that combines the power of existing genome annotation databases with a simple Web portal to enable users to search remote resources, combine data from independent queries, and visualize the results. The heart of Galaxy is a flexible history system that stores the queries from each user; performs operations such as intersections, unions, and subtractions; and links to other computational tools. Galaxy can be accessed at http://g2.bx.psu.edu.},
author = {Giardine, Belinda and Riemer, Cathy and Hardison, Ross C. and Burhans, Richard and Elnitski, Laura and Shah, Prachi and Zhang, Yi and Blankenberg, Daniel and Albert, Istvan and Taylor, James and Miller, Webb and Kent, W. James and Nekrutenko, Anton},
doi = {10.1101/gr.4086505},
isbn = {1088-9051 (Print)},
issn = {10889051},
journal = {Genome Research},
number = {10},
pages = {1451--1455},
pmid = {16169926},
title = {{Galaxy: A platform for interactive large-scale genome analysis}},
volume = {15},
year = {2005}
}
@article{Yankov2008a,
abstract = {The problem of finding unusual time series has recently attracted much attention, and several promising methods are now in the literature. However, virtually all proposed methods assume that the data reside in main memory. For many real-world problems this is not be the case. For example, in astronomy, multi-terabyte time series datasets are the norm. Most current algorithms faced with data which cannot fit in main memory resort to multiple scans of the disk/tape and are thus intractable. In this work we show how one particular definition of unusual time series, the time series discord, can be discovered with a disk aware algorithm. The proposed algorithm is exact and requires only two linear scans of the disk with a tiny buffer of main memory. Furthermore, it is very simple to implement. We use the algorithm to provide further evidence of the effectiveness of the discord definition in areas as diverse as astronomy, Web query mining, video surveillance, etc., and show the efficiency of our method on datasets which are many orders of magnitude larger than anything else attempted in the literature.},
author = {Yankov, Dragomir and Keogh, Eamonn and Rebbapragada, Umaa},
doi = {10.1007/s10115-008-0131-9},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Disk aware discord discovery finding unusual time series in terabyte sized datasets.pdf:pdf},
isbn = {0769530184},
issn = {02191377},
journal = {Knowledge and Information Systems},
keywords = {Discords,Disk aware algorithms,Distance outliers,Time series},
number = {2},
pages = {241--262},
title = {{Disk aware discord discovery: Finding unusual time series in terabyte sized datasets}},
volume = {17},
year = {2008}
}
@article{Graves2013,
abstract = {Recurrent neural networks (RNNs) are a powerful model for sequential data. End-to-end training methods such as Connectionist Temporal Classification make it possible to train RNNs for sequence labelling problems where the input-output alignment is unknown. The combination of these methods with the Long Short-term Memory RNN architecture has proved particularly fruitful, delivering state-of-the-art results in cursive handwriting recognition. However RNN performance in speech recognition has so far been disappointing, with better results returned by deep feedforward networks. This paper investigates $\backslash$emph{\{}deep recurrent neural networks{\}}, which combine the multiple levels of representation that have proved so effective in deep networks with the flexible use of long range context that empowers RNNs. When trained end-to-end with suitable regularisation, we find that deep Long Short-term Memory RNNs achieve a test set error of 17.7{\%} on the TIMIT phoneme recognition benchmark, which to our knowledge is the best recorded score.},
archivePrefix = {arXiv},
arxivId = {1303.5778},
author = {Graves, Alex and Mohamed, Abdel-rahman and Hinton, Geoffrey},
doi = {10.1109/ICASSP.2013.6638947},
eprint = {1303.5778},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/SPEECH RECOGNITION WITH DEEP RECURRENT NEURAL NETWORKS.pdf:pdf},
isbn = {978-1-4799-0356-6},
issn = {1520-6149},
journal = {Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on. IEEE},
number = {3},
pages = {6645--6649},
title = {{Speech Recognition with Deep Recurrent Neural Networks}},
url = {http://arxiv.org/abs/1303.5778},
year = {2013}
}
@article{Martens2012,
abstract = {In this chapter we will first describe the basic HF approach, and then examine well-known performance-improving techniques such as preconditioning which we have found to be beneficial for neural network training, as well as others of a more heuristic nature which are harder to justify, but which we have found to work well in practice. We will also provide practical tips for creating efficient and bug-free implementations and discuss various pitfalls which may arise when designing and using an HF-type approach in a particular application. {\&}copy; Springer-Verlag Berlin Heidelberg 2012.},
author = {Martens, James and Sutskever, Ilya},
doi = {10.1007/978-3-642-35289-8-27},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Training Deep and Recurrent Neural Networks with Hessian-Free Optimization.pdf:pdf},
isbn = {9783642352881},
issn = {03029743},
journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
pages = {479--535},
title = {{Training deep and recurrent networks with hessian-free optimization}},
volume = {7700 LECTU},
year = {2012}
}
@article{Lin2007,
abstract = {Many high level representations of time series have been proposed for data mining, including Fourier transforms, wavelets, eigenwaves, piecewise polynomial models, etc. Many researchers have also considered symbolic representations of time series, noting that such representations would potentiality allow researchers to avail of the wealth of data structures and algorithms from the text processing and bioinformatics communities. While many symbolic representations of time series have been introduced over the past decades, they all suffer from two fatal flaws. First, the dimensionality of the symbolic representation is the same as the original data, and virtually all data mining algorithms scale poorly with dimensionality. Second, although distance measures can be defined on the symbolic approaches, these distance measures have little correlation with distance measures defined on the original time series. In this work we formulate a new symbolic representation of time series. Our representation is unique in that it allows dimensionality/numerosity reduction, and it also allows distance measures to be defined on the symbolic approach that lower bound corresponding distance measures defined on the original series. As we shall demonstrate, this latter feature is particularly exciting because it allows one to run certain data mining algorithms on the efficiently manipulated symbolic representation, while producing identical results to the algorithms that operate on the original data. In particular, we will demonstrate the utility of our representation on various data mining tasks of clustering, classification, query by content, anomaly detection, motif discovery, and visualization.},
author = {Lin, Jessica and Keogh, Eamonn and Wei, Li and Lonardi, Stefano},
doi = {10.1007/s10618-007-0064-z},
isbn = {1384-5810},
issn = {13845810},
journal = {Data Mining and Knowledge Discovery},
keywords = {Data mining,Discretize,Symbolic representation,Time series},
number = {2},
pages = {107--144},
title = {{Experiencing SAX: A novel symbolic representation of time series}},
volume = {15},
year = {2007}
}
@book{Rasmussen2006,
archivePrefix = {arXiv},
arxivId = {026218253X},
author = {Rasmussen, Carl Edward and Williams, Christopher K I and {NetLibrary Inc.}},
booktitle = {Adaptive computation and machine learning},
doi = {10.1142/S0129065704001899},
eprint = {026218253X},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Gaussian Processes for Machine Learning.pdf:pdf},
isbn = {1423769902 (electronic bk.)},
issn = {0129-0657},
keywords = {Electronic books.,Gaussian processes Data processing.,Machine learning Mathematical models.},
pages = {xviii, 248 p.},
pmid = {15112367},
title = {{Gaussian processes for machine learning}},
url = {http://www.netlibrary.com/urlapi.asp?action=summary{\&}v=1{\&}bookid=156015},
year = {2006}
}
@article{Jaeger2005,
abstract = {This tutorial is a worked-out version of a 5-hour course originally held at AIS in September/October 2002. It has two distinct components. First, it contains a mathematically-oriented crash course on traditional training methods for recurrent neural networks, covering back-propagation through time (BPTT), real-time recurrent learning (RTRL), and extended Kalman filtering approaches (EKF). This material is covered in Sections 2 – 5. The remaining sections 1 and 6 – 9 are much more gentle, more detailed, and illustrated with simple examples. They are intended to be useful as a stand-alone tutorial for the echo state network (ESN) approach to recurrent neural network training. The},
author = {Jaeger, Herbert},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/a tutorial on training recurrent neural networks.pdf:pdf},
isbn = {159},
journal = {ReVision},
pages = {1--46},
title = {{A tutorial on training recurrent neural networks , covering BPPT , RTRL , EKF and the " echo state network " approach}},
url = {http://www.mendeley.com/catalog/tutorial-training-recurrent-neural-networks-covering-bppt-rtrl-ekf-echo-state-network-approach/},
volume = {2002},
year = {2005}
}
@article{Siegelmann1995,
abstract = {This paper deals with finite size networks which consist of interconnections of synchronously evolving processors. Each processor updates its state by applying a "sigmoidal" function to a linear combination of the previous states of all units. We prove that one may simulate all Turing machines by such nets. In particular, one can simulate any multi-stack Turing machine in real time, and there is a net made up of 886 processors which computes a universal partial-recursive function. Products (high order nets) are not required, contrary to what had been stated in the literature. Non-deterministic Turing machines can be simulated by non-deterministic rational nets, also in real time. The simulation result has many consequences regarding the decidability, or more generally the complexity, of questions about recursive nets.},
author = {Siegelmann, H.T. and Sontag, E.D.},
doi = {10.1006/jcss.1995.1013},
isbn = {089791497X},
issn = {00220000},
journal = {Journal of Computer and System Sciences},
number = {1},
pages = {132--150},
pmid = {23773339},
title = {{On the Computational Power of Neural Nets}},
url = {http://www.sciencedirect.com/science/article/pii/S0022000085710136},
volume = {50},
year = {1995}
}
@article{Hermans2013,
abstract = {Time series often have a temporal hierarchy, with information that is spread out over multiple time scales. Common recurrent neural networks, however, do not explicitly accommodate such a hierarchy, and most research on them has been focusing on training algorithms rather than on their basic architecture. In this pa- per we study the effect of a hierarchy of recurrent neural networks on processing time series. Here, each layer is a recurrent network which receives the hidden state of the previous layer as input. This architecture allows us to perform hi- erarchical processing on difficult temporal tasks, and more naturally capture the structure of time series. We show that they reach state-of-the-art performance for recurrent networks in character-level language modeling when trained with sim- ple stochastic gradient descent. We also offer an analysis of the different emergent time scales.},
author = {Hermans, Michiel and Schrauwen, Benjamin},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Training and Analyzing Deep Recurrent Neural networks.pdf:pdf},
issn = {10495258},
journal = {Advances in Neural Information Processing Systems},
keywords = {Recurrent Neural Networks},
pages = {190--198},
title = {{Training and Analyzing Deep Recurrent Neural Networks}},
year = {2013}
}
@inproceedings{Warrender1999,
abstract = {Intrusion detection systems rely on a wide variety of observable data to distinguish between legitimate and illegitimate activities. We study one such observable-sequences of system calls into the kernel of an operating system. Using system-call data sets generated by several different programs, we compare the ability of different data modeling methods to represent normal behavior accurately and to recognize intrusions. We compare the following methods: simple enumeration of observed sequences; comparison of relative frequencies of different sequences; a rule induction technique; and hidden Markov models (HMMs). We discuss the factors affecting the performance of each method and conclude that for this particular problem, weaker methods than HMMs are likely sufficient},
author = {Warrender, C. and Forrest, S. and Pearlmutter, B.},
booktitle = {Proceedings of the 1999 IEEE Symposium on Security and Privacy (Cat. No.99CB36344)},
pages = {133--145},
publisher = {IEEE Comput. Soc},
title = {{Detecting intrusions using system calls: alternative data models}},
url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=766910},
year = {1999}
}
@article{Graphs1989a,
author = {Graphs, Unfolding Flow and Parameters, Sharing},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/ch10 sequence modeling recurrent and recursive nets.pdf:pdf;:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/ch10 sequence modeling recurrent and recursive nets.pdf:pdf},
pages = {191--235},
title = {{Sequence Modeling : Recurrent and Recursive Nets}},
year = {1989}
}
@article{Chan2015,
abstract = {We present a novel deep Recurrent Neural Network (RNN) model for acoustic modelling in Automatic Speech Recognition (ASR). We term our contribution as a TC-DNN-BLSTM-DNN model, the model combines a Deep Neural Network (DNN) with Time Convolution (TC), followed by a Bidirectional Long-Short Term Memory (BLSTM), and a final DNN. The first DNN acts as a feature processor to our model, the BLSTM then gen-erates a context from the sequence acoustic signal, and the final DNN takes the context and models the posterior probabilities of the acoustic states. We achieve a 3.47 WER on the Wall Street Journal (WSJ) eval92 task or more than 8{\%} relative improve-ment over the baseline DNN models},
archivePrefix = {arXiv},
arxivId = {1407.5949},
author = {Chan, William and Lane, Ian},
eprint = {1407.5949},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Deep Recurrent Neural Networks for Acoustic Modelling.pdf:pdf},
journal = {submitted Interspeech-2015},
keywords = {Neural and Evolutionary Computing},
title = {{Deep Recurrent Neural Networks for Acoustic Modelling}},
url = {http://arxiv.org/abs/1504.01482},
year = {2015}
}
@article{Florez-Larrahondo2005,
abstract = {Anomaly detection systems are developed by learning a baseline-model from a set of events captured from a computer system operating under normal conditions. The model is then used to recognize unusual activities as deviations from normality. Hidden Markov models (HMMs) are powerful probabilistic finite state machines that have been used to acquire these baseline-models. Although previous research has indicated that HMMs can effectively represent complex sequences, the traditional learning algorithm for HMMs is too computationally expensive for use with real-world anomaly detection systems. This paper describes the use of a novel incremental learning algorithm for HMMs that allows the efficient acquisition of anomaly detection models. The new learning algorithm requires less memory and training time than previous approaches for learning discrete HMMs and can be used to perform online learning of accurate baseline-models from complex computer applications to support anomaly detection.},
author = {Florez-Larrahondo, German and Bridges, Susan M and Vaughn, Rayford B},
journal = {Information Security},
pages = {506--514},
title = {{Efficient Modeling of Discrete Events for Anomaly Detection Using Hidden Markov Models}},
url = {http://dx.doi.org/10.1007/11556992{\_}38},
volume = {3650},
year = {2005}
}
@article{Greff2015,
abstract = {Several variants of the Long Short-Term Memory (LSTM) architecture for recurrent neural networks have been proposed since its inception in 1995. In recent years, these networks have become the state-of-the-art models for a variety of machine learning problems. This has led to a renewed interest in understanding the role and utility of various computational components of typical LSTM variants. In this paper, we present the first large-scale analysis of eight LSTM variants on three representative tasks: speech recognition, handwriting recognition, and polyphonic music modeling. The hyperparameters of all LSTM variants for each task were optimized separately using random search and their importance was assessed using the powerful fANOVA framework. In total, we summarize the results of 5400 experimental runs (about 15 years of CPU time), which makes our study the largest of its kind on LSTM networks. Our results show that none of the variants can improve upon the standard LSTM architecture significantly, and demonstrate the forget gate and the output activation function to be its most critical components. We further observe that the studied hyperparameters are virtually independent and derive guidelines for their efficient adjustment.},
archivePrefix = {arXiv},
arxivId = {1503.04069},
author = {Greff, Klaus and Srivastava, Rupesh Kumar and Koutn{\'{i}}k, Jan and Steunebrink, Bas R and Schmidhuber, J{\"{u}}rgen},
eprint = {1503.04069},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/LSTM A Search Space Odyssey.pdf:pdf;:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/LSTM A Search Space Odyssey.pdf:pdf},
journal = {arXiv},
pages = {10},
title = {{LSTM: A Search Space Odyssey}},
url = {http://arxiv.org/abs/1503.04069},
year = {2015}
}
@article{Chandola2009b,
abstract = {We present a comprehensive evaluation of a large number of semi-supervised anomaly detection techniques for time series data. Some of these are existing techniques and some are adaptations that have never been tried before. For example, we adapt the window based discord detection technique to solve this problem. We also investigate several techniques that detect anomalies in discrete sequences, by discretizing the time series data. We evaluate these techniques on a large variety of data sets obtained from a broad spectrum of application domains. The data sets have different char- acteristics in terms of the nature of normal time series and the nature of anomalous time series. We evaluate the tech- niques on di{\textregistered}erent metrics, such as accuracy in detecting the anomalous time series, sensitivity to parameters, and com- putational complexity, and provide useful insights regarding the e{\textregistered}ectiveness of di{\textregistered}erent techniques based on the exper- imental evaluation.},
author = {Chandola, Varun and Cheboli, Deepthi and Kumar, Vipin},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Detecting Anomalies in a Time Series Database.pdf:pdf},
journal = {{\ldots} Department, University of Minnesota, Tech. Rep},
pages = {12},
title = {{Detecting anomalies in a time series database}},
url = {http://scholar.google.com/scholar?hl=en{\&}btnG=Search{\&}q=intitle:Detecting+Anomalies+in+a+Time+Series+Database{\#}3},
year = {2009}
}
@article{Martens2010,
abstract = {We develop a 2nd-order optimization method based on the Hessian - free approach, and apply it to training deep auto-encoders. Without using pre-training, we obtain results superior to those reported by Hinton Salakhutdinov (2006) on the same tasks they considered. Our ...},
author = {Martens, James},
doi = {10.1155/2011/176802},
isbn = {9781605589077},
issn = {20901283},
journal = {27th International Conference on Machine Learning},
pages = {735--742},
pmid = {21512589},
title = {{Deep learning via Hessian-free optimization}},
url = {http://www.cs.toronto.edu/{~}asamir/cifar/HFO{\_}James.pdf},
volume = {951},
year = {2010}
}
@phdthesis{Lee2007,
author = {Lee, Honglak and Ekanadham, Chaitanya and Ng, AY},
booktitle = {NIPS},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Sparse deep belief net models for visual area V2 (BS thesis).pdf:pdf;:C$\backslash$:/Users/Majid/AppData/Local/Mendeley Ltd./Mendeley Desktop/Downloaded/Lee, Ekanadham, Ng - 2007 - Sparse deep belief net model for visual area V2(3).pdf:pdf},
pages = {1--8},
school = {Stanford},
title = {{Sparse deep belief net model for visual area V2.}},
url = {https://papers.nips.cc/paper/3313-sparse-deep-belief-net-model-for-visual-area-v2.pdf},
year = {2007}
}
@article{Keogh2006a,
abstract = {In this work, we introduce the new problem of finding time series discords. Time series discords are subsequences of longer time series that are maximally different to all the rest of the time series subsequences. They thus capture the sense of the most unusual subsequence within a time series. While discords have many uses for data mining, they are particularly attractive as anomaly detectors because they only require one intuitive parameter (the length of the subsequence), unlike most anomaly detection algorithms that typically require many parameters. While the brute force algorithm to discover time series discords is quadratic in the length of the time series, we show a simple algorithm that is three to four orders of magnitude faster than brute force, while guaranteed to produce identical results. We evaluate our work with a comprehensive set of experiments on electrocardiograms and other medical datasets.},
author = {Keogh, Eamonn and Lin, Jessica and Fu, Ada Waichee and {Van Herle}, Helga},
doi = {10.1109/TITB.2005.863870},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Finding Unusual Medical Time-Series Subsequences algorithms and applications.pdf:pdf},
issn = {1089-7771},
journal = {IEEE transactions on information technology in biomedicine : a publication of the IEEE Engineering in Medicine and Biology Society},
keywords = {Algorithms,Artificial Intelligence,Cluster Analysis,Computer Simulation,Diagnosis, Computer-Assisted,Diagnosis, Computer-Assisted: methods,Electrocardiography,Electrocardiography: methods,Humans,Information Storage and Retrieval,Information Storage and Retrieval: methods,Models, Biological,Pattern Recognition, Automated,Pattern Recognition, Automated: methods,Polysomnography,Polysomnography: methods,Reproducibility of Results,Sensitivity and Specificity,Time Factors},
number = {3},
pages = {429--39},
pmid = {16871709},
title = {{Finding unusual medical time-series subsequences: algorithms and applications.}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/16871709},
volume = {10},
year = {2006}
}
@article{Lu2004,
abstract = {In this paper, a wavelet fuzzy classification approach is proposed to detect and track region outliers in meteorological data. First wavelet transform is applied to meteorological data to bring up distinct patterns that might be hidden within the original data. Then a powerful image processing technique, edge detection with competitive fuzzy classifier, is extended to identify the boundary of region outlier. After that, to determine the center of the region outlier, the fuzzy-weighted average of the longitudes and latitudes of the boundary locations is computed. By linking the centers of the outlier regions within consecutive frames, the movement of a region outlier can be captured and traced. Experimental evaluation was conducted on a real-world meteorological data to examine the effectiveness of the proposed approach. This work will help discover interesting and implicit information for large volume of meteorological data.},
author = {Lu, Chang T and Liang, Lily R},
doi = {10.1145/1032222.1032260},
isbn = {1-58113-979-9},
journal = {GIS '04: Proceedings of the 12th annual ACM international workshop on Geographic information systems},
keywords = {outlier{\_}detection},
pages = {258--265},
title = {{Wavelet fuzzy classification for detecting and tracking region outliers in meteorological data}},
url = {http://dx.doi.org/10.1145/1032222.1032260},
year = {2004}
}
@article{Nairac1999,
abstract = {A system has been developed to extract diagnostic information from jet engine carcass vibration data. The system consists of a number of modules, each of which focuses on particular subsets of the data known to hold valuable information. Two of these modules, based on neural network techniques, are described in detail in this paper. In the first module, novelty detection provides a measure of how unusual the shape of a vibration signature is, by learning a representation of normality based entirely on normal examples. The low-dimensional vectors which encode vibration signatures are normalised by an appropriate transform before their distribution is modelled by a few kernels, whose placement is optimised by clustering techniques. Novelty is then measured as the local distance from the nearest kernel centre. This method provides good separation between usual and unusual vibration signatures but, given the small number of examples of normal engines, the resulting representation of normalitymay be overfitting the training data. The severity of this effect is investigated for two different normalising transforms. The second module detects sudden transitions in vibration signature curves. A multi-layer-perceptron is trained to predict one step-ahead for curves without these unexpected transitions. Sudden transitions in the test engine data are then reported whenever the prediction error exceeds a predetermined threshold.},
author = {Nairac, Alexandre and Townsend, Neil and Carr, Roy and King, Steve and Cowley, Peter and Tarassenko, Lionel},
isbn = {1069-2509},
journal = {Integrated Computer-Aided Engineering},
number = {1},
pages = {53--66},
title = {{A system for the analysis of jet engine vibration data}},
url = {http://iospress.metapress.com/content/YWAUBUE89WBW8X9L},
volume = {6},
year = {1999}
}
@phdthesis{Cheboli2010,
abstract = {University of Minnesota M.S. thesis. May 2010. Major: Computer Science. Advisor: Prof.Vipin Kumar. 1 computer science (PDF); vi, 75 pages. Ill. (some col.)},
author = {Cheboli, Deepthi},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Anomaly Detection of Time Series.pdf:pdf;:C$\backslash$:/Users/Majid/AppData/Local/Mendeley Ltd/Mendeley Desktop/Downloaded/Cheboli - 2010 - Anomaly Detection of Time Series.pdf:pdf},
school = {The University of Minnesota},
title = {{Anomaly Detection of Time Series}},
url = {http://udc.umn.edu/handle/11299/92985},
year = {2010}
}
@article{Lane1997,
abstract = {The anomaly detection problem has been widely studied in the computer security literature. In this paper we present a machine learning approach to anomaly detection. Our system builds user profiles based on command sequences and compares current input sequences to the profile using a similarity measure. The system must learn to classify current behavior as consistent or anomalous with past behavior using only positive examples of the account's valid user. Our empirical results demonstrate that this is a promising approach to distinguishing the legitimate user from an intruder.},
author = {Lane, Terran and Brodley, Carla E},
journal = {Computer Engineering},
keywords = {1 317 494 0635,1 317 494 6440,anomaly detection,computer security,ecn,edu,email,fax,machine learning,phone,purdue,terran},
pages = {366--377},
title = {{An Application of Machine Learning to Anomaly Detection}},
url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.112.9537{\&}rep=rep1{\&}type=pdf},
year = {1997}
}
@article{Gupta2014,
author = {Gupta, Manish and Gao, Jing and Aggarwal, Charu C and Han, Jiawei},
file = {:C$\backslash$:/Users/Majid/Dropbox/ad/rsrc/Outlier Detection for Temporal Data A Survey (Majid d's conflicted copy).pdf:pdf},
number = {9},
pages = {2250--2267},
title = {{Outlier Detection for Temporal Data : A Survey}},
volume = {26},
year = {2014}
}
@article{Boulanger-Lewandowski2012,
abstract = {We investigate the problem of modeling symbolic sequences of polyphonic music in a completely general piano-roll representation. We introduce a probabilistic model based on distribution estimators conditioned on a recurrent neural network that is able to discover temporal dependencies in high-dimensional sequences. Our approach outperforms many traditional models of polyphonic music on a variety of realistic datasets. We show how our musical language model can serve as a symbolic prior to improve the accuracy of polyphonic transcription.},
archivePrefix = {arXiv},
arxivId = {1206.6392},
author = {Boulanger-Lewandowski, Nicolas and Vincent, Pascal and Bengio, Yoshua},
eprint = {1206.6392},
isbn = {978-1-4503-1285-1},
journal = {Proceedings of the 29th International Conference on Machine Learning (ICML-12)},
number = {Cd},
pages = {1159--1166},
title = {{Modeling Temporal Dependencies in High-Dimensional Sequences: Application to Polyphonic Music Generation and Transcription}},
year = {2012}
}
@incollection{gupta2012community,
author = {Gupta, Manish and Gao, Jing and Sun, Yizhou and Han, Jiawei},
booktitle = {Machine Learning and Knowledge Discovery in Databases},
pages = {692--708},
publisher = {Springer},
title = {{Community trend outlier detection using soft temporal pattern mining}},
year = {2012}
}
@article{Hinneburg2000,
abstract = {Nearest neighbor search in high dimensional spaces is an interesting and important problem which is relevant for a wide variety of novel database applications. As recent results show, however, the problem is a very difficult one, not only with regards to the performance issue but also to the quality issue. In this paper, we discuss the quality issue and identify a new generalized notion of nearest neighbor search as the relevant problem in high dimensional space. In contrast to previous approaches, our new notion of nearest neighbor search does not treat all dimensions equally but uses a quality criterion to select relevant dimensions (projections) with respect to the given query. As an example for a useful quality criterion, we rate how well the data is clustered around the query point within the selected projection. We then propose an efficient and effective algorithm to solve the generalized nearest neighbor problem. Our experiments based on a number of real and synthetic data sets show that our new approach provides new insights into the nature of nearest neighbor search on high dimensional data.},
author = {Hinneburg, Alexander and Aggarwal, Charu C and Keim, Daniel a},
doi = {10.1.1.2.238},
isbn = {1-55860-715-3},
journal = {Proceedings of the 26th VLDB Conference},
pages = {506--515},
title = {{What is the Nearest Neighbor in High Dimensional Spaces?}},
url = {http://www.informatik.uni-halle.de/{~}hinnebur/PS{\_}Files/vldb2000{\_}hd{\_}similarity.pdf},
year = {2000}
}
@article{Chiu2003,
abstract = {Several important time series data mining problems reduce to the core task of finding approximately repeated subsequences in a longer time series. In an earlier work, we formalized the idea of approximately repeated subsequences by introducing the notion of time series motifs. Two limitations of this work were the poor scalability of the motif discovery algorithm, and the inability to discover motifs in the presence of noise. Here we address these limitations by introducing a novel algorithm inspired by recent advances in the problem of pattern discovery in biosequences. Our algorithm is probabilistic in nature, but as we show empirically and theoretically, it can find time series motifs with very high probability even in the presence of noise or dont care symbols. Not only is the algorithm fast, but it is an anytime algorithm, producing likely candidate motifs almost immediately, and gradually improving the quality of results over time.},
author = {Chiu, Bill and Keogh, Eamonn and Lonardi, Stefano},
doi = {10.1145/956750.956808},
isbn = {1581137370},
issn = {00221694},
journal = {Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining KDD 03},
keywords = {data mining,motifs,randomized algorithms,time series},
pages = {493},
title = {{Probabilistic discovery of time series motifs}},
url = {http://portal.acm.org/citation.cfm?doid=956750.956808},
volume = {304},
year = {2003}
}
@article{Muthukrishnan2004,
abstract = { One of the central tasks in managing, monitoring and mining data streams is that of identifying outliers. There is a long history of study of various outliers in statistics and databases, and a recent focus on mining outliers in data streams. Here, we adopt the notion of "deviants" from Jagadish et al. (1999) as outliers. Deviants are based on one of the most fundamental statistical concept of standard deviation (or variance). Formally, deviants are defined based on a representation sparsity metric, i.e., deviants are values whose removal from the dataset leads to an improved compressed representation of the remaining items. Thus, deviants are not global maxima/minima, but rather these are appropriate local aberrations. Deviants are known to be of great mining value in time series databases. We present first-known algorithms for identifying deviants on massive data streams. Our algorithms monitor streams using very small space (polylogarithmic in data size) and are able to quickly find deviants at any instant, as the data stream evolves over time. For all versions of this problem - uni- vs multivariate time series, optimal vs near-optimal vs heuristic solutions, offline vs streaming - our algorithms have the same framework of maintaining a hierarchical set of candidate deviants that are updated as the time series data gets progressively revealed. We show experimentally using real network traffic data (SNMP aggregate time series) as well as synthetic data that our algorithm is remarkably accurate in determining the deviants.},
author = {Muthukrishnan, S. and Shah, R. and Vitter, J.S.},
doi = {10.1109/SSDM.2004.1311192},
isbn = {0-7695-2146-0},
issn = {1099-3371},
journal = {Proceedings. 16th International Conference on Scientific and Statistical Database Management, 2004.},
pages = {41--50},
title = {{Mining deviants in time series data streams}},
url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=1311192},
year = {2004}
}