Here's a slightly better version and a python script. -ml -- put this in <file> and run using 'cqlsh -f <file>'
DROP KEYSPACE latest; CREATE KEYSPACE latest WITH replication = { 'class': 'SimpleStrategy', 'replication_factor' : 1 }; USE latest; CREATE TABLE time_series ( bucket_userid text, -- bucket is the beginning of a datetime span concatenated with a shard designator user_id text, pkid text, timeuuid text, colname text, PRIMARY KEY (bucket_userid, timeuuid) ); UPDATE time_series SET user_id = 'XYZ', pkid = '1000', colname = 'Col-Name-1' WHERE bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='200' ; UPDATE time_series SET user_id = 'XYZ', pkid = '1001', colname = 'Col-Name-2' WHERE bucket_userid = '2013-09-11T05:15-1_XYZ' AND timeuuid='201' ; UPDATE time_series SET user_id = 'XYZ', pkid = '1000', colname = 'Col-Name-3' WHERE bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='202' ; UPDATE time_series SET user_id = 'XYZ', pkid = '1000', colname = 'Col-Name-4' WHERE bucket_userid = '2013-09-11T05:30-1_XYZ' AND timeuuid='203' ; UPDATE time_series SET user_id = 'XYZ', pkid = '1002', colname = 'Col-Name-5' WHERE bucket_userid = '2013-09-11T05:30-0_XYZ' AND timeuuid='204' ; -- This query assumes that the 'current' span is 2013-09-11T05:30 and I am interested in this span and the previous one. 
SELECT * FROM time_series WHERE bucket_userid IN ( -- go back as many spans as you need to, all shards in each span (cartesian product) '2013-09-11T05:15-0_XYZ', '2013-09-11T05:15-1_XYZ', '2013-09-11T05:30-0_XYZ', '2013-09-11T05:30-1_XYZ' ) -- you could add a range condition on timeuuid to further restrict the results ORDER BY timeuuid DESC; -- returns: -- bucket_userid | timeuuid | colname | pkid | user_id --------------------------+----------+------------+------+--------- -- 2013-09-11T05:30-0_XYZ | 204 | Col-Name-5 | 1002 | XYZ -- 2013-09-11T05:30-1_XYZ | 203 | Col-Name-4 | 1000 | XYZ -- 2013-09-11T05:15-0_XYZ | 202 | Col-Name-3 | 1000 | XYZ -- 2013-09-11T05:15-1_XYZ | 201 | Col-Name-2 | 1001 | XYZ -- 2013-09-11T05:15-0_XYZ | 200 | Col-Name-1 | 1000 | XYZ -- do a stable purge on pkid to get the result python script: #!/usr/bin/env python # -*- coding: utf-8 -*- import logging logging.basicConfig() import cassandra from cassandra.cluster import Cluster from cassandra.query import SimpleStatement cql_cluster = Cluster() cql_session = cql_cluster.connect() cql_session.set_keyspace('latest') select_stmt = """ SELECT * FROM time_series WHERE bucket_userid IN ( -- go back as many spans as you need to, all shards in each span (cartesian product) '2013-09-11T05:15-0_XYZ', '2013-09-11T05:15-1_XYZ', '2013-09-11T05:30-0_XYZ', '2013-09-11T05:30-1_XYZ' ) ORDER BY timeuuid DESC; """ query = SimpleStatement(select_stmt) rows = cql_session.execute(query) pkids = set() for row in rows: if row.pkid in pkids: continue else: print row.user_id, row.timeuuid, row.colname, row.pkid pkids.add(row.pkid) # prints: # XYZ 204 Col-Name-5 1002 # XYZ 203 Col-Name-4 1000 # XYZ 201 Col-Name-2 1001 On Wed, Sep 11, 2013 at 6:13 AM, Laing, Michael <michael.la...@nytimes.com>wrote: > Then you can do this. I handle millions of entries this way and it works > well if you are mostly interested in recent activity. 
> > If you need to span all activity then you can use a separate table to > maintain the 'latest'. This table should also be sharded as entries will be > 'hot'. Sharding will spread the heat and the tombstones (compaction load) > around the cluster. > > -ml > > -- put this in <file> and run using 'cqlsh -f <file> > > DROP KEYSPACE latest; > > CREATE KEYSPACE latest WITH replication = { > 'class': 'SimpleStrategy', > 'replication_factor' : 1 > }; > > USE latest; > > CREATE TABLE time_series ( > bucket_userid text, -- bucket is the beginning of a datetime span > concatenated with a shard designator > pkid text, > timeuuid text, > colname text, > PRIMARY KEY (bucket_userid, timeuuid) > ); > > -- the example table is using 15 minute bucket spans and 2 shards for > illustration (you would usually use more shards) > -- adjust these appropriately for your application > > UPDATE time_series SET pkid = '1000', colname = 'Col-Name-1' where > bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='200'; > UPDATE time_series SET pkid = '1001', colname = 'Col-Name-2' where > bucket_userid = '2013-09-11T05:15-1_XYZ' AND timeuuid='201'; > UPDATE time_series SET pkid = '1000', colname = 'Col-Name-3' where > bucket_userid = '2013-09-11T05:15-0_XYZ' AND timeuuid='202'; > UPDATE time_series SET pkid = '1000', colname = 'Col-Name-4' where > bucket_userid = '2013-09-11T05:30-1_XYZ' AND timeuuid='203'; > UPDATE time_series SET pkid = '1002', colname = 'Col-Name-5' where > bucket_userid = '2013-09-11T05:30-0_XYZ' AND timeuuid='204'; > > -- This query assumes that the 'current' span is 2013-09-11T05:30 and I am > interested in this span and the previous one. 
> > SELECT * FROM time_series > WHERE bucket_userid in ( -- go back as many spans as you need to, all > shards in each span (cartesian product) > '2013-09-11T05:15-0_XYZ', > '2013-09-11T05:15-1_XYZ', > '2013-09-11T05:30-0_XYZ', > '2013-09-11T05:30-1_XYZ' > ) > ORDER BY timeuuid DESC; > > -- returns: > -- bucket_userid | timeuuid | colname | pkid > --------------------------+----------+------------+------ > -- 2013-09-11T05:30-0_XYZ | 204 | Col-Name-5 | 1002 > -- 2013-09-11T05:30-1_XYZ | 203 | Col-Name-4 | 1000 > -- 2013-09-11T05:15-0_XYZ | 202 | Col-Name-3 | 1000 > -- 2013-09-11T05:15-1_XYZ | 201 | Col-Name-2 | 1001 > -- 2013-09-11T05:15-0_XYZ | 200 | Col-Name-1 | 1000 > > -- do a stable purge on pkid to get the result. > > > On Wed, Sep 11, 2013 at 1:01 AM, Ravikumar Govindarajan < > ravikumar.govindara...@gmail.com> wrote: > >> Thanks Michael, >> >> But I cannot sort the rows in memory, as the number of columns will be >> quite huge. >> >> From the python script above: >> select_stmt = "select * from time_series where userid = 'XYZ'" >> >> This would return me many hundreds of thousands of columns. I need to go >> in time-series order using ranges [Pagination queries]. 
>> >> >> On Wed, Sep 11, 2013 at 7:06 AM, Laing, Michael < >> michael.la...@nytimes.com> wrote: >> >>> If you have set up the table as described in my previous message, you >>> could run this python snippet to return the desired result: >>> >>> #!/usr/bin/env python >>> # -*- coding: utf-8 -*- >>> import logging >>> logging.basicConfig() >>> >>> from operator import itemgetter >>> >>> import cassandra >>> from cassandra.cluster import Cluster >>> from cassandra.query import SimpleStatement >>> >>> cql_cluster = Cluster() >>> cql_session = cql_cluster.connect() >>> cql_session.set_keyspace('latest') >>> >>> select_stmt = "select * from time_series where userid = 'XYZ'" >>> query = SimpleStatement(select_stmt) >>> rows = cql_session.execute(query) >>> >>> results = [] >>> for row in rows: >>> max_time = max(row.colname.keys()) >>> results.append((row.userid, row.pkid, max_time, >>> row.colname[max_time])) >>> >>> sorted_results = sorted(results, key=itemgetter(2), reverse=True) >>> for result in sorted_results: print result >>> >>> # prints: >>> >>> # (u'XYZ', u'1002', u'204', u'Col-Name-5') >>> # (u'XYZ', u'1000', u'203', u'Col-Name-4') >>> # (u'XYZ', u'1001', u'201', u'Col-Name-2') >>> >>> >>> >>> On Tue, Sep 10, 2013 at 6:32 PM, Laing, Michael < >>> michael.la...@nytimes.com> wrote: >>> >>>> You could try this. C* doesn't do it all for you, but it will >>>> efficiently get you the right data. 
>>>> >>>> -ml >>>> >>>> -- put this in <file> and run using 'cqlsh -f <file> >>>> >>>> DROP KEYSPACE latest; >>>> >>>> CREATE KEYSPACE latest WITH replication = { >>>> 'class': 'SimpleStrategy', >>>> 'replication_factor' : 1 >>>> }; >>>> >>>> USE latest; >>>> >>>> CREATE TABLE time_series ( >>>> userid text, >>>> pkid text, >>>> colname map<text, text>, >>>> PRIMARY KEY (userid, pkid) >>>> ); >>>> >>>> UPDATE time_series SET colname = colname + {'200':'Col-Name-1'} WHERE >>>> userid = 'XYZ' AND pkid = '1000'; >>>> UPDATE time_series SET colname = colname + >>>> {'201':'Col-Name-2'} WHERE userid = 'XYZ' AND pkid = '1001'; >>>> UPDATE time_series SET colname = colname + >>>> {'202':'Col-Name-3'} WHERE userid = 'XYZ' AND pkid = '1000'; >>>> UPDATE time_series SET colname = colname + >>>> {'203':'Col-Name-4'} WHERE userid = 'XYZ' AND pkid = '1000'; >>>> UPDATE time_series SET colname = colname + >>>> {'204':'Col-Name-5'} WHERE userid = 'XYZ' AND pkid = '1002'; >>>> >>>> SELECT * FROM time_series WHERE userid = 'XYZ'; >>>> >>>> -- returns: >>>> -- userid | pkid | colname >>>> >>>> ----------+------+----------------------------------------------------------------- >>>> -- XYZ | 1000 | {'200': 'Col-Name-1', '202': 'Col-Name-3', '203': >>>> 'Col-Name-4'} >>>> -- XYZ | 1001 | {'201': >>>> 'Col-Name-2'} >>>> -- XYZ | 1002 | {'204': >>>> 'Col-Name-5'} >>>> >>>> -- use an app to pop off the latest key/value from the map for each >>>> row, then sort by key desc. >>>> >>>> >>>> On Tue, Sep 10, 2013 at 9:21 AM, Ravikumar Govindarajan < >>>> ravikumar.govindara...@gmail.com> wrote: >>>> >>>>> I have been faced with a problem of grouping composites on the >>>>> second-part. 
>>>>> >>>>> Lets say my CF contains this >>>>> >>>>> >>>>> TimeSeriesCF >>>>> key: UserID >>>>> composite-col-name: TimeUUID:PKID >>>>> >>>>> Some sample data >>>>> >>>>> UserID = XYZ >>>>> Time:PKID >>>>> Col-Name1 = 200:1000 >>>>> Col-Name2 = 201:1001 >>>>> Col-Name3 = 202:1000 >>>>> Col-Name4 = 203:1000 >>>>> Col-Name5 = 204:1002 >>>>> >>>>> Whenever a time-series query is issued, it should return the following >>>>> in time-desc order. >>>>> >>>>> UserID = XYZ >>>>> Col-Name5 = 204:1002 >>>>> Col-Name4 = 203:1000 >>>>> Col-Name2 = 201:1001 >>>>> >>>>> Is something like this possible in Cassandra? Is there a different way >>>>> to design and achieve the same objective? >>>>> >>>>> -- >>>>> Ravi >>>>> >>>>> >>>> >>>> >>> >> >