test_elasticsearch.ipynb 4.46 KB
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# based on: http://blog.tryolabs.com/2015/02/17/python-elasticsearch-first-steps/\n",
    "\n",
    "import requests\n",
    "# Connectivity check: fetch the Elasticsearch root endpoint and show the raw reply.\n",
    "# An explicit timeout makes the cell fail fast instead of blocking forever\n",
    "# when the node accepts the connection but never answers.\n",
    "res = requests.get('http://localhost:9200', timeout=10)\n",
    "print(res.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "# Client handle shared by the indexing and query cells below.\n",
    "es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 9200}])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import codecs\n",
    "import json\n",
    "import logging\n",
    "import sys\n",
    "from util import ngrams\n",
    "from collections import defaultdict\n",
    "from filter import city_filter\n",
    "from twitter.Tweet import Tweet\n",
    "\n",
    "# Build the city-name lookup that is passed to city_filter.get_US_City further down.\n",
    "cityNamesDict = city_filter.normalizeCityNames()\n",
    "# Load the cities15000 resource file and print how many entries were read.\n",
    "cities15000 = city_filter.loadCities15000(filename=\"resources/cities15000.txt\")\n",
    "print len(cities15000)\n",
    "\n",
    "# Input tweets (presumably gzipped JSON, judging by the file name and the\n",
    "# getTweetAsDictionaryFromGZ reader used later); the path is relative to the\n",
    "# notebook's working directory.\n",
    "inputFile = \"../../../english-tweets-20151101.json.gz\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def wordcountPlain(tweet, onlyHashtags=False, ngram=1):\n",
    "    \"\"\"Tokenize a tweet's text and return its tokens, hashtags or n-grams.\n",
    "\n",
    "    The text is split with Tweet.tokenizeTweetText; tokens of length <= 2 and\n",
    "    tokens flagged by ngrams.is_url_or_mention are discarded first.\n",
    "\n",
    "    tweet        -- mapping with a 'text' entry holding the tweet text\n",
    "    onlyHashtags -- when true (and ngram <= 1), keep only tokens starting with '#'\n",
    "    ngram        -- when > 1, return the filtered tokens followed by the output\n",
    "                    of ngrams.window for every size from 2 up to ngram;\n",
    "                    onlyHashtags is ignored in that case\n",
    "\n",
    "    Returns a list.\n",
    "    \"\"\"\n",
    "    candidates = Tweet.tokenizeTweetText(tweet['text'])\n",
    "    tokenList = [tok for tok in candidates\n",
    "                 if len(tok) > 2 and not ngrams.is_url_or_mention(tok)]\n",
    "\n",
    "    if ngram > 1:\n",
    "        # single tokens first, then windows of growing size\n",
    "        combined = list(tokenList)\n",
    "        for size in range(2, ngram + 1):\n",
    "            combined.extend(ngrams.window(tokenList, size))\n",
    "        return combined\n",
    "\n",
    "    if onlyHashtags:\n",
    "        return [tok for tok in tokenList if tok.startswith('#')]\n",
    "    return tokenList"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Index tweets for which city_filter.get_US_City returns a city, stopping\n",
    "# once 100 of them have been sent to Elasticsearch.\n",
    "tweetsAsDict = Tweet.getTweetAsDictionaryFromGZ(inputFile)\n",
    "i = 0  # count of indexed tweets; doubles as the document id\n",
    "for tweet in tweetsAsDict:\n",
    "    city = city_filter.get_US_City(tweet, cityNamesDict, cities15000)\n",
    "    if not city:\n",
    "        continue  # get_US_City returned a falsy value: nothing to index\n",
    "    i += 1\n",
    "    ngramsList = wordcountPlain(tweet, onlyHashtags=True, ngram=1)\n",
    "    # enrich the document with the city and its hashtags before indexing\n",
    "    tweet['city'] = city\n",
    "    tweet['ngrams'] = ngramsList\n",
    "    if ngramsList:\n",
    "        print i\n",
    "    es.index(index='100tweets', doc_type='tweets+', id=i, body=tweet)\n",
    "    print city, tweet['text'], ngramsList\n",
    "    if i % 100 == 0:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Spot check: fetch document 14 from the '100tweets' index.\n",
    "es.get(id=14, index='100tweets', doc_type='tweets+')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Match query on the 'text' field of the documents in '100tweets'.\n",
    "es.search(\n",
    "    index='100tweets',\n",
    "    body={'query': {'match': {'text': 'Thank you'}}}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Re-query the Elasticsearch root endpoint; the explicit timeout keeps the\n",
    "# cell from blocking indefinitely if the node stops responding.\n",
    "r = requests.get('http://localhost:9200', timeout=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    ""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2.0
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}