{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# findspark.init() locates the Spark installation and makes pyspark importable,\n",
    "# so it is called before the pyspark imports.\n",
    "import findspark\n",
    "findspark.init()\n",
    "import pyspark\n",
    "from pyspark import SparkConf\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "# When this cell is re-run, stop the session left over from the previous run\n",
    "# before building a new one.\n",
    "if globals().get('spark'):\n",
    "    spark.stop()\n",
    "    spark = None\n",
    "\n",
    "# Local master with 8 threads; switch to setMaster('yarn') to run on YARN instead.\n",
    "conf = SparkConf().setMaster('local[8]').setAppName('testing anon load')\n",
    "\n",
    "spark = SparkSession.builder.config(conf=conf).getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 3796\r\n",
      "-rw-r--r--. 1 eco eco 132413 Jun 29 10:40 \u001b[0m\u001b[38;5;9mpart-00010-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco 272767 Jun 29 10:40 \u001b[38;5;9mpart-00009-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco 272314 Jun 29 10:40 \u001b[38;5;9mpart-00008-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco 277158 Jun 29 10:40 \u001b[38;5;9mpart-00007-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco 321451 Jun 29 10:40 \u001b[38;5;9mpart-00006-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco 331419 Jun 29 10:40 \u001b[38;5;9mpart-00005-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco 337195 Jun 29 10:40 \u001b[38;5;9mpart-00004-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco 366346 Jun 29 10:40 \u001b[38;5;9mpart-00003-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco 423154 Jun 29 10:40 \u001b[38;5;9mpart-00002-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco 458187 Jun 29 10:40 \u001b[38;5;9mpart-00000-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n",
      "-rw-r--r--. 1 eco eco      0 Jun 29 10:40 _SUCCESS\r\n",
      "-rw-r--r--. 1 eco eco 673836 Jun 29 10:40 \u001b[38;5;9mpart-00001-2558524a-1a1f-4f14-b027-4276a8143194-c000.json.gz\u001b[0m\r\n"
     ]
    }
   ],
   "source": [
    "# Long listing of the input directory, ordered by modification time, oldest first.\n",
    "%ls -ltr '/data/small-anon/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.42 ms, sys: 2.15 ms, total: 6.57 ms\n",
      "Wall time: 6.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Load the JSON files under the directory into a single DataFrame. No schema is\n",
    "# passed, so Spark infers one by reading the input.\n",
    "dp = spark.read.json('/data/small-anon/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "StructType(List(StructField(created,LongType,true),StructField(id,StringType,true),StructField(properties,StructType(List(StructField(WANAccessType,StringType,true),StructField(arrisNvgDbCheck,StringType,true),StructField(deviceClassifiers,ArrayType(StringType,true),true),StructField(deviceType,StringType,true),StructField(firstInform,StringType,true),StructField(groups,ArrayType(StringType,true),true),StructField(hardwareVersion,StringType,true),StructField(hncEnable,StringType,true),StructField(lastBoot,StringType,true),StructField(lastInform,StringType,true),StructField(lastPeriodic,StringType,true),StructField(manufacturerName,StringType,true),StructField(modelName,StringType,true),StructField(productClass,StringType,true),StructField(protocolVersion,StringType,true),StructField(provisioningCode,StringType,true),StructField(softwareVersion,StringType,true),StructField(tags,ArrayType(StringType,true),true),StructField(timeZone,StringType,true),StructField(wan,StructType(List(StructField(ethDuplexMode,StringType,true),StructField(ethSyncBitRate,StringType,true))),true),StructField(wifi,ArrayType(StructType(List(StructField(0,StructType(List(StructField(Enable,StringType,true),StructField(SSID,StringType,true),StructField(SSIDAdvertisementEnabled,StringType,true))),true),StructField(1,StructType(List(StructField(Enable,StringType,true),StructField(SSID,StringType,true),StructField(SSIDAdvertisementEnabled,StringType,true))),true),StructField(2,StructType(List(StructField(Enable,StringType,true),StructField(SSID,StringType,true),StructField(SSIDAdvertisementEnabled,StringType,true))),true),StructField(3,StructType(List(StructField(Enable,StringType,true),StructField(SSID,StringType,true),StructField(SSIDAdvertisementEnabled,StringType,true))),true),StructField(4,StructType(List(StructField(Enable,StringType,true),StructField(SSID,StringType,true),StructField(SSIDAdvertisementEnabled,StringType,true))),true),StructField(5,StructType(List(StructField(Enable,St
ringType,true),StructField(SSID,StringType,true),StructField(SSIDAdvertisementEnabled,StringType,true))),true),StructField(6,StructType(List(StructField(Enable,StringType,true),StructField(SSID,StringType,true),StructField(SSIDAdvertisementEnabled,StringType,true))),true),StructField(7,StructType(List(StructField(Enable,StringType,true),StructField(SSID,StringType,true),StructField(SSIDAdvertisementEnabled,StringType,true))),true))),true),true))),true),StructField(ts,LongType,true)))"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dp.schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.2 ms, sys: 972 µs, total: 3.17 ms\n",
      "Wall time: 1.39 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "33447"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Number of rows in the loaded DataFrame; count() is an action, so this runs a Spark job.\n",
    "dp.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(created=1562609946950, id='df9bd7a5a1ac11e9acf100505699063c', properties=Row(WANAccessType='1.5', arrisNvgDbCheck=None, deviceClassifiers=['ARRIS HNC IGD', 'Annex F Gateway', '001E46.NVG443B.Is.WANIP', 'Supports.Collect.Optimized.Workflow', 'Device.Supports.FastVDSLProxy', 'Supports.TR98.Traceroute', 'InternetGatewayDevice:1.4', 'Motorola.ServiceType.IP', 'Device.Supports.DSL.Stats', 'Wireless.Common.IGD.DualRadio', 'Arris.NVG44x.9.2.4', 'Device.Supports.HNC', 'Device.Type.RG', 'Arris.NVG4xx.Missing.CA', 'Supports.TR98.IPPing', 'Device.Supports.DSL.Basic', 'Wireless', 'Arris.NVG.Wireless', 'WLAN.Radios.Action.Common.TR098', 'ConnecticutDeviceTypes', 'Device.Supports.SpeedTest', 'Arris.NVG4xx.9.2.4h1d28', '001E46.NVG443B.Is.WANDsl', 'Motorola.device', 'Device.Supports.DSL.Filter', 'Arris.NVG4xx', 'CaptivePortal:1', 'All.TR069.RG.Devices', 'TraceRoute:1', 'All.Devices', 'Arris.NVG44x.9.2.4+', 'Arris.NVG443B', 'datamodel.igd', 'IPPing:1', 'Device.ServiceType.IP', 'Arris.NVG4xx.9.2.4', 'broken.device.no.notification'], deviceType='IGD', firstInform='1562609974427', groups=['Total Control', 'Self-Service Diagnostics', 'HSI', 'SLF-SRVC_DGNSTCS000', 'NVG44x Run 11', 'XDSL_18M_1M', 'HS005', 'XDSL_18M_1M000', 'TTL_CNTRL000', 'TCW - NVG4xx - First Contact', 'TCW Enabled'], hardwareVersion='NVG443B_0200210031004B', hncEnable='1', lastBoot='1590607587605', lastInform='1590623991014', lastPeriodic='1590623991014', manufacturerName='Motorola', modelName='NVG443B', productClass='NVG443B', protocolVersion='cwmp10', provisioningCode='', softwareVersion='9.2.4h1d28', tags=['default'], timeZone='EST+5EDT', wan=None, wifi=[Row(0=Row(Enable='1', SSID='Frontier7936', SSIDAdvertisementEnabled='1'), 1=Row(Enable='0', SSID='Guest7936', SSIDAdvertisementEnabled='1'), 2=Row(Enable='0', SSID='Frontier7936_D2', SSIDAdvertisementEnabled='1'), 3=Row(Enable='0', SSID='Frontier7936_D3', SSIDAdvertisementEnabled='1'), 4=Row(Enable='1', SSID='Frontier7936', 
SSIDAdvertisementEnabled='1'), 5=Row(Enable='0', SSID='Guest7936_5G', SSIDAdvertisementEnabled='1'), 6=Row(Enable='0', SSID='', SSIDAdvertisementEnabled='1'), 7=Row(Enable='0', SSID='Frontier7936_5G_D2', SSIDAdvertisementEnabled='1'))]), ts=1590623991014),\n",
       " Row(created=1517876222268, id='2e9a7bbf87526d9dedfcbb9eaaeb67b5', properties=Row(WANAccessType='2', arrisNvgDbCheck='1:success', deviceClassifiers=['ARRIS HNC IGD', 'Annex F Gateway', 'Supports.Collect.Optimized.Workflow', 'Fast.Inform', 'InternetGatewayDevice:1.4', 'Supports.TR98.Traceroute', 'Supports Arris FastPath Speed Test', 'Motorola.ServiceType.IP', 'Arris.NVG468MQ.9.3.0h0', 'Wireless.Common.IGD.DualRadio', '001E46.NVG468MQ.Is.WANIP', 'Device.Supports.HNC', 'Device.Type.RG', 'Supports.TR98.IPPing', 'Arris.NVG4xx.Missing.CA', 'Arris.NVG468MQ.9.3.0+', 'Wireless', 'ARRIS HNC IGD EUROPA', 'Arris.NVG.Wireless', 'VoiceService:1.0', 'WLAN.Radios.Action.Common.TR098', 'ConnecticutDeviceTypes', 'Device.Supports.SpeedTest', 'Motorola.Device.Supports.VoIP', 'Arris.NVG468MQ', 'Motorola.device', 'CaptivePortal:1', 'Arris.NVG4xx', 'All.TR069.RG.Devices', 'TraceRoute:1', 'Arris.NVG4xx.9.3.0+', 'datamodel.igd', 'Arris.NVG4xxQ', 'IPPing:1', 'Device.ServiceType.IP', '001E46.NVG468MQ.Is.WANEth', 'Arris.NVG468MQ.9.2.4+', 'broken.device.no.notification'], deviceType='IGD', firstInform='1517876240261', groups=['Total Control', 'Self-Service Diagnostics', 'HSI', 'SLF-SRVC_DGNSTCS000', 'HS004', 'GPN_150M_150M001', 'TTL_CNTRL000', 'GPON_150M_150M'], hardwareVersion='NVG468MQ_0200240031004E', hncEnable='0', lastBoot='1590217806220', lastInform='1590623997346', lastPeriodic='1590623997346', manufacturerName='Motorola', modelName='NVG468MQ', productClass='NVG468MQ', protocolVersion='cwmp10', provisioningCode='', softwareVersion='9.3.0h0d55', tags=['default'], timeZone='EST+5EDT,M3.2.0/2,M11.1.0/2', wan=Row(ethDuplexMode='Full', ethSyncBitRate='1000'), wifi=[Row(0=Row(Enable='1', SSID='Frontier8560', SSIDAdvertisementEnabled='1'), 1=Row(Enable='0', SSID='Guest8560', SSIDAdvertisementEnabled='1'), 2=Row(Enable='0', SSID='Frontier8560_D2', SSIDAdvertisementEnabled='1'), 3=Row(Enable='0', SSID='Frontier8560_D3', SSIDAdvertisementEnabled='1'), 4=Row(Enable='1', 
SSID='Frontier8560_5G', SSIDAdvertisementEnabled='1'), 5=Row(Enable='0', SSID='Guest8560_5G', SSIDAdvertisementEnabled='1'), 6=Row(Enable='1', SSID='Frontier8560_5G-TV', SSIDAdvertisementEnabled='0'), 7=Row(Enable='0', SSID='Frontier8560_5G_D2', SSIDAdvertisementEnabled='1'))]), ts=1590623997346),\n",
       " Row(created=1454955570705, id='4c39b8a4f7531e8aa811f614cf99bf1d', properties=Row(WANAccessType='1', arrisNvgDbCheck=None, deviceClassifiers=['InternetGatewayDevice:1.1', 'Netgear.D2200D', 'Netgear.device', 'Wireless.Common.IGD.SingleRadio', 'Fast.Inform', 'Supports.TR98.Traceroute', 'Device.Supports.DSL.Filter', 'CaptivePortal:1', 'All.TR069.RG.Devices', 'Netgear.device.Wireless', 'TraceRoute:1', 'All.Devices', 'datamodel.igd', 'Device.Type.RG', 'Supports.TR98.IPPing', 'Netgear.D2200D.VER_01.00.65', 'IPPing:1', 'Device.Supports.DSL.Basic', 'FTRDSLDataCollectGrp1', '00600F.D2200D-1FRNAS.Is.WANDsl', '00600F.D2200D-1FRNAS.Is.WANPPP', 'WLAN.Radios.Action.Common.TR098', 'ConnecticutDeviceTypes'], deviceType='IGD', firstInform='1454955567458', groups=['HSI', 'HS002', 'PPP', 'PPP004'], hardwareVersion='D2200D-1FRNASA', hncEnable=None, lastBoot='1590614679912', lastInform='1590623996525', lastPeriodic='1590623996525', manufacturerName='Netgear', modelName='D2200D-1FRNAS', productClass='D2200D-1FRNAS', protocolVersion='cwmp10', provisioningCode='', softwareVersion='VER_01.00.65', tags=['default'], timeZone='EST5EDT,M3.2.0/02:00:00,M11.1.0/02:00:00', wan=None, wifi=[Row(0=Row(Enable='1', SSID='Frontier2EOA', SSIDAdvertisementEnabled='1'), 1=Row(Enable='0', SSID='Guest2E0A', SSIDAdvertisementEnabled='1'), 2=Row(Enable='0', SSID='ThirdNETGEAR', SSIDAdvertisementEnabled='1'), 3=Row(Enable='0', SSID='FourthNETGEAR', SSIDAdvertisementEnabled='1'), 4=None, 5=None, 6=None, 7=None)]), ts=1590623996525)]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dp.take(3)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
