Mis primeros pasos con MongoDB

Para ser honesto yo ya había instalado MongoDB hace años, pero no es sino hasta ahora que me ha tocado un proyecto en el cual creo que le puedo sacar mucho provecho. En vista de esto, me senté a jugar un poco con la herramienta.

La Internet cuenta con muchos lugares de donde podemos bajar juegos de datos, en este caso me decidí bajarme la lista de los nombres de bebes más populares en el estado de Nueva York entre los años 2011 y 2014, en formato JSON.

{
  "meta" : {
    "view" : {
      "id" : "25th-nujf",
      "name" : "Most Popular Baby Names by Sex and Mother's Ethnic Group, New York City",
      "attribution" : "Department of Health and Mental Hygiene (DOHMH)",
      "averageRating" : 0,
      "category" : "Health",
      "createdAt" : 1382724894,
      "description" : "The most popular baby names by sex and mother's ethnicity in New York City.",
      "displayType" : "table",
      "downloadCount" : 4328,
      "hideFromCatalog" : false,
      "hideFromDataJson" : false,
      "indexUpdatedAt" : 1465427458,
      "newBackend" : false,
...
  },
  "data" : [ [ 1, "EB6FAA1B-EE35-4D55-B07B-8E663565CCDF", 1, 1386853125, "399231", 1386853125, "399231", "{\n}", "2011", "FEMALE", "HISPANIC", "GERALDINE", "13", "75" ]
, [ 2, "2DBBA431-D26F-40A1-9375-AF7C16FF2987", 2, 1386853125, "399231", 1386853125, "399231", "{\n}", "2011", "FEMALE", "HISPANIC", "GIA", "21", "67" ]
, [ 3, "54318692-0577-4B21-80C8-9CAEFCEDA8BA", 3, 1386853125, "399231", 1386853125, "399231", "{\n}", "2011", "FEMALE", "HISPANIC", "GIANNA", "49", "42" ]
, [ 4, "17C1236A-5778-412D-8DC9-94EBC01BB9A1", 4, 1386853125, "399231", 1386853125, "399231", "{\n}", "2011", "FEMALE", "HISPANIC", "GISELLE", "38", "51" ]
, [ 5, "F53CF696-A3F4-4EC3-8DFD-10C0A111B2D8", 5, 1386853125, "399231", 1386853125, "399231", "{\n}", "2011", "FEMALE", "HISPANIC", "GRACE", "36", "53" ]
, [ 6, "6615893F-39B8-440C-98D3-5A37CCF1C44B", 6, 1386853125, "399231", 1386853125, "399231", "{\n}", "2011", "FEMALE", "HISPANIC", "GUADALUPE", "26", "62" ]
, [ 7, "CC9BE461-34B8-4BD7-BEF2-BDB23CA1ADC6", 7, 1386853125, "399231", 1386853125, "399231", "{\n}", "2011", "FEMALE", "HISPANIC", "HAILEY", "126", "8" ]
, [ 8, "4EA2FFD4-1B1D-4859-A5C2-045949E3FD36", 8, 1386853125, "399231", 1386853125, "399231", "{\n}", "2011", "FEMALE", "HISPANIC", "HALEY", "14", "74" ]
]}

Antes de importar el archivo en Mongo, hay que darle un masaje a los datos, para ignorar la metadata y para separar cada entrada en el archivo para que luzca como un documento individual, sobre el cual podremos hacer agregación.

Primero les muestro como arrancar MongoDB, importar los datos (no muestro como bajarse el archivo pero es trivial) y luego corremos unos cuantos consultas:

#!/bin/bash
export PATH=$PATH:/Users/josevnz/mongo/mongodb-osx-x86_64-3.2.10/bin
mongod --fork --logpath /Users/josevnz/mongo/mongod.log --noauth --rest --pidfilepath /Users/josevnz/mongo/mongod.pid --dbpath /Users/josevnz/mongo/data

El programa para masajear los datos, en Python:

#!/usr/bin/env python
# josevn at kodegeek.com
import json
import sys
from pprint import pprint
if len(sys.argv) < 2:
  raise ValueError("Missing JSON file...")

'''
// From this (array of arrays):
{
    "data": [
        [
            1,
            "EB6FAA1B-EE35-4D55-B07B-8E663565CCDF",
            1,
            1386853125,
            "399231",
            1386853125,
            "399231",
            "{\n}",
            "2011",
            "FEMALE",
            "HISPANIC",
            "GERALDINE",
            "13",
            "75"
        ],
        [
          ...
        ]
    ],
    "metadata" : {
...
    }
}
 // To this individual maps, no metadata and no extra attributes:
      {
       "key": 1,
       "id": "EB6FAA1B-EE35-4D55-B07B-8E663565CCDF",
       "year": "2011",
       "gender": "FEMALE",
       "ethnicity": "HISPANIC",
       "name": "GERALDINE",
       "cnt": 13
      },
      {
       ...
      }
'''
with open(sys.argv[1]) as data_file:
  data = json.load(data_file)
  for item in data["data"]:
    newitem = {}
    newitem['key'] = item[0]
    newitem['id'] = item[1]
    newitem['year'] = int(item[8])
    newitem['gender'] = item[9]
    newitem['ethnicity'] = item[10]
    newitem['name'] = item[11]
    newitem['cnt'] = int(item[12])
    print(json.dumps(newitem))

Luego importamos los datos y nos conectamos listos para correr un par de consultas:

./jsonMap.py ~/Downloads/NY.babynames.json > ~/Downloads/NY.babynames.pretty.json
mongoimport --db ny --collection babynames --file ~/Downloads/NY.babynames.pretty.json --upsert
mongo localhost:27017/ny

Me encanta que no tuve que crear una base de datos, o una tabla en SQL :-). El documento se describe sólo y ahora estoy listo para hacer consultas:

// ¿Cuantos bebes hispanos entre el 2011 y el 2014?
> db.babynames.find({ "ethnicity": "HISPANIC"}).count()
4254
// Muestre los total agrupando por raza, sexo y año de mayor  a menor
> db.babynames.aggregate([ { $group: { _id: { ethnicity: "$ethnicity", gender: "$gender", year: "$year" }, total: { $sum: "$cnt"} } }, {$sort: {total:-1}} ])
{ "_id" : { "ethnicity" : "HISPANIC", "gender" : "MALE", "year" : 2011 }, "total" : 56236 }
{ "_id" : { "ethnicity" : "WHITE NON HISPANIC", "gender" : "MALE", "year" : 2011 }, "total" : 54392 }
{ "_id" : { "ethnicity" : "WHITE NON HISPANIC", "gender" : "FEMALE", "year" : 2011 }, "total" : 45120 }
{ "_id" : { "ethnicity" : "HISPANIC", "gender" : "FEMALE", "year" : 2011 }, "total" : 41532 }
{ "_id" : { "ethnicity" : "BLACK NON HISPANIC", "gender" : "MALE", "year" : 2011 }, "total" : 24540 }
{ "_id" : { "ethnicity" : "ASIAN AND PACIFIC ISLANDER", "gender" : "MALE", "year" : 2011 }, "total" : 18580 }
{ "_id" : { "ethnicity" : "BLACK NON HISPANIC", "gender" : "FEMALE", "year" : 2011 }, "total" : 17624 }
{ "_id" : { "ethnicity" : "WHITE NON HISPANIC", "gender" : "MALE", "year" : 2014 }, "total" : 14831 }
{ "_id" : { "ethnicity" : "WHITE NON HISPANIC", "gender" : "MALE", "year" : 2013 }, "total" : 14537 }
{ "_id" : { "ethnicity" : "WHITE NON HISP", "gender" : "MALE", "year" : 2012 }, "total" : 14273 }
{ "_id" : { "ethnicity" : "HISPANIC", "gender" : "MALE", "year" : 2012 }, "total" : 13809 }
{ "_id" : { "ethnicity" : "ASIAN AND PACIFIC ISLANDER", "gender" : "FEMALE", "year" : 2011 }, "total" : 13672 }
{ "_id" : { "ethnicity" : "HISPANIC", "gender" : "MALE", "year" : 2013 }, "total" : 13312 }
{ "_id" : { "ethnicity" : "HISPANIC", "gender" : "MALE", "year" : 2014 }, "total" : 13126 }
{ "_id" : { "ethnicity" : "WHITE NON HISPANIC", "gender" : "FEMALE", "year" : 2014 }, "total" : 12884 }
{ "_id" : { "ethnicity" : "WHITE NON HISP", "gender" : "FEMALE", "year" : 2012 }, "total" : 12402 }
{ "_id" : { "ethnicity" : "WHITE NON HISPANIC", "gender" : "FEMALE", "year" : 2013 }, "total" : 12303 }
{ "_id" : { "ethnicity" : "HISPANIC", "gender" : "FEMALE", "year" : 2013 }, "total" : 9755 }
{ "_id" : { "ethnicity" : "HISPANIC", "gender" : "FEMALE", "year" : 2012 }, "total" : 9738 }
{ "_id" : { "ethnicity" : "HISPANIC", "gender" : "FEMALE", "year" : 2014 }, "total" : 9729 }
Type "it" for more
> it
{ "_id" : { "ethnicity" : "BLACK NON HISP", "gender" : "MALE", "year" : 2012 }, "total" : 5965 }
{ "_id" : { "ethnicity" : "ASIAN AND PACI", "gender" : "MALE", "year" : 2012 }, "total" : 5962 }
{ "_id" : { "ethnicity" : "BLACK NON HISPANIC", "gender" : "MALE", "year" : 2013 }, "total" : 5866 }
{ "_id" : { "ethnicity" : "BLACK NON HISPANIC", "gender" : "MALE", "year" : 2014 }, "total" : 5702 }
{ "_id" : { "ethnicity" : "ASIAN AND PACIFIC ISLANDER", "gender" : "MALE", "year" : 2014 }, "total" : 5636 }
{ "_id" : { "ethnicity" : "ASIAN AND PACIFIC ISLANDER", "gender" : "MALE", "year" : 2013 }, "total" : 5281 }
{ "_id" : { "ethnicity" : "ASIAN AND PACI", "gender" : "FEMALE", "year" : 2012 }, "total" : 4338 }
{ "_id" : { "ethnicity" : "BLACK NON HISPANIC", "gender" : "FEMALE", "year" : 2013 }, "total" : 4278 }
{ "_id" : { "ethnicity" : "BLACK NON HISPANIC", "gender" : "FEMALE", "year" : 2014 }, "total" : 4255 }
{ "_id" : { "ethnicity" : "BLACK NON HISP", "gender" : "FEMALE", "year" : 2012 }, "total" : 4243 }
{ "_id" : { "ethnicity" : "ASIAN AND PACIFIC ISLANDER", "gender" : "FEMALE", "year" : 2014 }, "total" : 4198 }
{ "_id" : { "ethnicity" : "ASIAN AND PACIFIC ISLANDER", "gender" : "FEMALE", "year" : 2013 }, "total" : 4012 }

Estoy emocionado con la herramienta, pienso usarla para ver los datos en mi trabajo de manera distinta.