{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from filter import city_filter\n",
    "from util import twokenize, ngrams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "US_CITIES = [\"new york\", \"san francisco\", \"los angeles\", \"chicago\", \"houston\", \"philadelphia\", \"san diego\", \"boston\",\n",
    "             \"seattle\", \"austin\"]\n",
    "US_CITIES_with_synonyms = [\"new york\", \"san francisco\", \"los angeles\", \"chicago\", \"houston\", \"philadelphia\",\n",
    "                           \"san diego\", \"boston\", \"seattle\", \"austin\", \"ny\", \"nyc\", \"la\", \"sf\", \"philly\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "city_Dict10 = city_filter.normalizeCityNames()"
   ]
  },
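  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Quick sanity check (not part of the original filter code): this assumes `normalizeCityNames()` returns a dict mapping spellings and synonyms to the ten canonical city names. The synonym keys looked up below are an assumption, not something guaranteed by the `filter` module."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Assumption: city_Dict10 maps synonyms/spellings to canonical names, e.g. \"nyc\" -> \"new york\".\n",
    "# Using .get() so a missing key prints None instead of raising a KeyError.\n",
    "for synonym in [\"nyc\", \"sf\", \"la\", \"philly\", \"seattle\"]:\n",
    "    print synonym, \"->\", city_Dict10.get(synonym)"
   ]
  },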
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8491\n"
     ]
    }
   ],
   "source": [
    "cities15000 = city_filter.loadCities15000(filename=\"resources/cities15000.txt\")\n",
    "print len(cities15000)"
   ]
  },
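  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`cities15000` is only used for membership tests (`token in cities15000`) below. The next cell is a small inspection sketch; it assumes the entries are lowercase city-name strings taken from the GeoNames `cities15000.txt` dump, which is not verified here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Peek at a few entries and run some membership tests (assumes lowercase string entries).\n",
    "print sorted(cities15000)[:5]\n",
    "for name in [\"seattle\", \"new york\", \"dar es salaam\"]:\n",
    "    print name, name in cities15000"
   ]
  },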
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def getCityFromToken(token, city_list):\n",
    "    city = \"\"\n",
    "    if token in city_list:\n",
    "        city = token\n",
    "    return city\n",
    "\n",
    "\n",
    "def cleanLists(potentialCities):\n",
    "    \"\"\"\n",
    "\n",
    "    :param potentialCities: a set\n",
    "    :return: a list\n",
    "    \"\"\"\n",
    "    if \"\" in potentialCities:\n",
    "        potentialCities.remove(\"\")\n",
    "    return list(potentialCities)"
   ]
  },
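  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal check of the two helpers above on literal data; it only relies on the lists defined earlier in this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# getCityFromToken returns the token itself when it is a known city, otherwise \"\".\n",
    "print getCityFromToken(\"seattle\", US_CITIES_with_synonyms)  # -> seattle\n",
    "print getCityFromToken(\"gotham\", US_CITIES_with_synonyms)   # -> (empty string)\n",
    "# cleanLists drops the empty-string placeholder and returns a plain list.\n",
    "print cleanLists(set([\"\", \"seattle\", \"boston\"]))"
   ]
  },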
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def getUserLocationCity(locationField, cities15000, city_Dict10):\n",
    "    \"\"\"\n",
    "    THis field is an empty string\n",
    "    :param tweet:\n",
    "    :return: a list of unique elems\n",
    "\n",
    "\n",
    "    todo:\n",
    "    - there may be more cities but only one in the list - so not ok - MORE CITIES dilemma\n",
    "    - twokenizer removes punctuation but not special char\n",
    "    - NY dilemma: \"New York, USA\" vs. \"New York, NY USA\"\n",
    "    \"\"\"\n",
    "\n",
    "    potentialCities = set()\n",
    "    world_city_set = set()\n",
    "\n",
    "    # 0 Pre-tests\n",
    "    # special check ny - as state\n",
    "    if \", ny\" in locationField.lower():\n",
    "        if \"new york, ny\" not in locationField.lower():  # con o senza virgola\n",
    "            return []\n",
    "\n",
    "    if \", la\" in locationField.lower():\n",
    "        return []\n",
    "\n",
    "\n",
    "\n",
    "    # 1. split by / - the only char that is not in the tokeniker!\n",
    "    if \"/\" in locationField:\n",
    "        locArray = locationField.split(\"/\")\n",
    "        for token in locArray:\n",
    "            city = getCityFromToken(token.strip().lower(), US_CITIES_with_synonyms)\n",
    "            world_city = getCityFromToken(token.strip().lower(), cities15000)\n",
    "            if city:\n",
    "                potentialCities.add(city_Dict10[city])\n",
    "            if world_city:\n",
    "                world_city_set.add(world_city)\n",
    "\n",
    "    # 2. tokenize with util and get unigrams, bigrams and trigrams - to lower\n",
    "    # unigrams\n",
    "    tokenList = twokenize.tokenize(locationField.lower())\n",
    "    tokens = ngrams.window_no_twitter_elems(tokenList, 1)\n",
    "    for token in tokens:\n",
    "        city = getCityFromToken(token.strip().lower(), US_CITIES_with_synonyms)\n",
    "        world_city = getCityFromToken(token.strip().lower(), cities15000)\n",
    "        if city:\n",
    "            potentialCities.add(city_Dict10[city])\n",
    "        if world_city:\n",
    "                world_city_set.add(world_city)\n",
    "\n",
    "    # bigrams\n",
    "    tokens = ngrams.window_no_twitter_elems(tokenList, 2)\n",
    "    for token in tokens:\n",
    "        city = getCityFromToken(token.strip().lower(), US_CITIES_with_synonyms)\n",
    "        world_city = getCityFromToken(token.strip().lower(), cities15000)\n",
    "        if city:\n",
    "            potentialCities.add(city_Dict10[city])\n",
    "            world_city_set.add(city)\n",
    "\n",
    "    # trigrams\n",
    "    tokens = ngrams.window_no_twitter_elems(tokenList, 3)\n",
    "    for token in tokens:\n",
    "        city = getCityFromToken(token.strip().lower(), US_CITIES_with_synonyms)\n",
    "        world_city = getCityFromToken(token.strip().lower(), cities15000)\n",
    "        if city:\n",
    "            potentialCities.add(city_Dict10[city])\n",
    "        if world_city:\n",
    "                world_city_set.add(world_city)\n",
    "\n",
    "    world_city_clean = cleanLists(world_city_set)\n",
    "    c = cleanLists(potentialCities)\n",
    "    \n",
    "    print \"WC\", world_city_clean, \"C\", c\n",
    "    \n",
    "    # 3  more cities in the user location\n",
    "    \n",
    "    if len(world_city_clean) == 0 and len(c) != 0:\n",
    "        return c\n",
    "    \n",
    "    if len(world_city_clean) != len(c):\n",
    "        return []\n",
    "    else:\n",
    "        if len(set(world_city_clean).intersection(set(c)))==0:\n",
    "            return []\n",
    "\n",
    "    return c"
   ]
  },
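  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`getUserLocationCity` relies on `ngrams.window_no_twitter_elems` for its n-gram windows. The cell below is a stand-alone sketch of plain sliding-window n-grams for comparison only; it is an assumption that the real helper additionally drops Twitter-specific elements (mentions, URLs, ...), so the two are not interchangeable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def simple_ngrams(tokens, n):\n",
    "    # Plain sliding window of n consecutive tokens joined by spaces.\n",
    "    # Unlike ngrams.window_no_twitter_elems (assumption), this does NOT filter\n",
    "    # Twitter-specific elements such as @mentions or URLs.\n",
    "    return [\" \".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]\n",
    "\n",
    "print simple_ngrams([\"los\", \"angeles\", \"ca\"], 1)\n",
    "print simple_ngrams([\"los\", \"angeles\", \"ca\"], 2)"
   ]
  },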
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WC ['seattle'] C ['seattle']\n",
      "['seattle']\n",
      "WC [u'seattle'] C ['seattle']\n",
      "['seattle']\n",
      "WC [u'seattle'] C ['seattle']\n",
      "['seattle']\n"
     ]
    }
   ],
   "source": [
    "# print getUserLocationCity(\"Seattle, Wa.\", cities15000, city_Dict10) \n",
    "# print getUserLocationCity(\"San Diego☀️\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"Houston, Tx.\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"Los Angeles. CA\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"SAN DIEGO\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"San Diego | San Francisco\", cities15000, city_Dict10) \n",
    "# print getUserLocationCity(\"Manhattan, NY\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"Los Angeles, CA\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"New York, USA\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"New York, NY USA\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"Austin, TX\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"Chico, CA\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"San Diego | Paris\", cities15000, city_Dict10) # == [\"san diego\"]\n",
    "\n",
    "\n",
    "# print getUserLocationCity(\"Philly, Jersey, NYC\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"NYC | LA\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"MS, AL, AR, LA, TX, TN, & FL\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"Brighton/LA\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"New Orleans, LA\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"LA/Manila/Kyoto-Sakyo\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"NY.MIAMI.LA\", cities15000, city_Dict10)\n",
    "\n",
    "\n",
    "# print getUserLocationCity(\"south philly / rhode island\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"philly//asheville\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"otra philly :(\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"metlife//philly\", cities15000, city_Dict10)\n",
    "\n",
    "\n",
    "### how to fix this\n",
    "# print getUserLocationCity(\"austin and justin follows %\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"austin, tx ✈️ splendora, tx\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"austin, texas\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"Cameron,justin,austin &2/5jano\", cities15000, city_Dict10)\n",
    "\n",
    "# print getUserLocationCity(\"chicago/dar es salaam\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"bridgeport chicago\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"hopefully chicago soon ✈️\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"chicago, il\", cities15000, city_Dict10)\n",
    "\n",
    "### problem1!!\n",
    "# print getUserLocationCity(\"houston ms\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"houston tx\", cities15000, city_Dict10)\n",
    "\n",
    "# print getUserLocationCity(\"boston - toronto\", cities15000, city_Dict10)\n",
    "# print getUserLocationCity(\"boston | 1989 7/24 & 10/31\", cities15000, city_Dict10)\n",
    "\n",
    "print getUserLocationCity(\"otra//seattle\", cities15000, city_Dict10)\n",
    "print getUserLocationCity(\"otra seattle // 7.15.15\", cities15000, city_Dict10)\n",
    "print getUserLocationCity(\"seattle | she/her\", cities15000, city_Dict10)\n",
    "\n",
    "### sometimes there is otra! otra philly , otra seattle etc? - check out those"
   ]
  },
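  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sketch for the \"otra\" note above (an assumption, not part of the filter module): strip a small stop-list of fan-slang tokens from the location field before matching. `NOISE_TOKENS` and `strip_noise` are hypothetical names introduced here for illustration."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "NOISE_TOKENS = set([\"otra\"])  # hypothetical stop-list, extend as needed\n",
    "\n",
    "def strip_noise(locationField):\n",
    "    # Split on whitespace and slashes (mirroring the \"/\" handling above)\n",
    "    # and drop the noise tokens before any city matching.\n",
    "    parts = locationField.lower().replace(\"/\", \" \").split()\n",
    "    return \" \".join(p for p in parts if p not in NOISE_TOKENS)\n",
    "\n",
    "print strip_noise(\"otra//seattle\")            # -> seattle\n",
    "print strip_noise(\"otra seattle // 7.15.15\")  # -> seattle 7.15.15"
   ]
  },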
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    ""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2.0
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}