Apologies, the checkpoint log entry appeared a few seconds after I sent this
email. pg_locks is now at 7000.
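
(For reference, the number above is just a row count over pg_locks,
gathered with queries roughly along these lines; the mode/granted
breakdown is sometimes handy too:)

SELECT count(*) FROM pg_locks;

SELECT mode, granted, count(*)
FROM pg_locks
GROUP BY mode, granted
ORDER BY count(*) DESC;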

And vmstat:

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 7  0      0 351212   7032 37412872    0    0   167    91    0    1 21  1 78  0  0
 4  0      0 350600   7040 37374464    0    0  6077   332 12889 6634  7  3 90  0  0
40  0      0 346244   7040 37310604    0    0  3687  2638 16355 5517  7 31 61  0  0
27  0      0 385620   7040 37206560    0    0    69  1587 14483 4108  3 75 22  0  0


On Tue, Apr 2, 2013 at 1:09 AM, Armand du Plessis <a...@bank.io> wrote:

> Thanks for the reply.
>
> I've now updated the background writer settings to:
>
> # - Background Writer -
>
> bgwriter_delay = 200ms                  # 10-10000ms between rounds
> bgwriter_lru_maxpages = 400             # 0-1000 max buffers written/round
> bgwriter_lru_multiplier = 2.0           # 0-10.0 multiplier on buffers scanned/round
>
> checkpoint_segments = 128
> checkpoint_timeout = 25min
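>
> (To double-check that the new values actually took effect after the
> reload, something along these lines can be used:)
>
> SELECT name, setting FROM pg_settings
> WHERE name LIKE 'bgwriter%' OR name LIKE 'checkpoint%';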
>
> It's still happening at the moment, this time without any checkpoint
> entries in the log :(
>
> Below is the output from vmstat; I'm not sure what to look for in it, though.
>
> Thanks again,
>
> Armand
>
>
> $ sudo vmstat 5
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
>  r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
>  2  0      0 485800   4224 44781700    0    0   167    91    1    0 21  1 78  0  0
>  7  0      0 353920   4224 44836176    0    0  6320    54 21371 12921 11  2 87  0  0
> 32  0      0 352220   4232 44749544    0    0  1110     8 19414 9620  6 42 52  0  0
>  3  0      0 363044   4232 44615772    0    0    59  1943 11185 3774  0 81 18  0  0
> 48  0      0 360076   4240 44550744    0    0     0    34 9563 5210  0 74 26  0  0
> 33  0      0 413708   4240 44438248    0    0    92   962 11250 8169  0 61 39  0  0
> 109  0      0 418080   4240 44344596    0    0   605  3490 10098 6216  1 49 50  0  0
> 58  0      0 425388   4240 44286528    0    0     5    10 10794 2470  1 91  8  0  0
> 53  0      0 435864   4240 44243000    0    0    11     0 9755 2428  0 92  8  0  0
> 12  0      0 440792   4248 44213164    0    0   134     5 7883 3038  0 51 49  0  0
>  3  0      0 440360   4256 44158684    0    0   548   146 8450 3930  2 27 70  0  0
>  2  0      0 929236   4256 44248608    0    0 10466   845 22575 14196 20  5 74  0  0
>  4  0      0 859160   4256 44311828    0    0  7120    61 20890 12835 12  1 86  0  0
>  4  0      0 685308   4256 44369404    0    0  6110    24 20645 12545 13  1 85  0  0
>  4  0      0 695440   4256 44396304    0    0  5351  1208 19529 11781 11  1 88  0  0
>  4  0      0 628276   4256 44468116    0    0  9202     0 19875 12172  9  1 89  0  0
>  6  0      0 579716   4256 44503848    0    0  3799    22 19223 11772 10  1 88  0  0
>  3  1      0 502948   4256 44539784    0    0  3721  6700 20620 11939 13  1 85  0  0
>  4  0      0 414120   4256 44583456    0    0  3860   856 19801 12092 10  1 89  0  0
>  6  0      0 349240   4256 44642880    0    0  6122    48 19834 11933 11  2 87  0  0
>  3  0      0 400536   4256 44535872    0    0  6287     5 18945 11461 10  1 89  0  0
>  3  0      0 364256   4256 44592412    0    0  5487  2018 20145 12344 11  1 87  0  0
>  7  0      0 343732   4256 44598784    0    0  4209    24 19099 11482 10  1 88  0  0
>  6  0      0 339608   4236 44576768    0    0  6805   151 18821 11333  9  2 89  0  0
>  9  1      0 339364   4236 44556884    0    0  2597  4339 19205 11918 11  3 85  0  0
> 24  0      0 341596   4236 44480368    0    0  6165  5309 19353 11562 11  4 84  1  0
> 30  0      0 359044   4236 44416452    0    0  1364     6 12638 6138  5 28 67  0  0
>  4  0      0 436468   4224 44326500    0    0  3704  1264 11346 7545  4 27 68  0  0
>  3  1      0 459736   4224 44384788    0    0  6541     8 20159 12097 11  1 88  0  0
>  8  1      0 347812   4224 44462100    0    0 12292  2860 20851 12377  9  1 89  1  0
>  1  0      0 379752   4224 44402396    0    0  5849   147 20171 12253 11  1 88  0  0
>  4  0      0 453692   4216 44243480    0    0  6546   269 20689 13028 12  2 86  0  0
>  8  0      0 390160   4216 44259768    0    0  4243     0 20476 21238  6 16 78  0  0
>  6  0      0 344504   4216 44336264    0    0  7214     2 20919 12625 11  1 87  0  0
>  4  0      0 350128   4200 44324976    0    0 10726  2173 20417 12351 10  1 88  0  0
>  2  1      0 362300   4200 44282484    0    0  7148   714 22469 14468 12  2 86  0  0
>  3  0      0 366252   4184 44311680    0    0  7617   133 20487 12364  9  1 90  0  0
>  6  0      0 368904   4184 44248152    0    0  5162     6 22910 15221 14  7 80  0  0
>  2  0      0 383108   4184 44276780    0    0  5846  1120 21109 12563 11  1 88  0  0
>  7  0      0 338348   4184 44274472    0    0  9270     5 21243 12698 10  1 88  0  0
> 24  0      0 339676   4184 44213036    0    0  6639    18 22976 12700 13 12 74  0  0
> 12  0      0 371848   4184 44146500    0    0   657   133 18968 7445  5 53 43  0  0
> 37  0      0 374516   4184 44076212    0    0    16     2 9156 4472  1 48 52  0  0
> 16  0      0 398412   4184 43971060    0    0   127     0 9967 6018  0 48 52  0  0
>  4  0      0 417312   4184 44084392    0    0 17434  1072 23661 14268 16  6 78  1  0
>  4  0      0 407672   4184 44139896    0    0  5785     0 19779 11869 11  1 88  0  0
>  9  0      0 349544   4184 44051596    0    0  6899     8 20376 12774 10  3 88  0  0
>  5  0      0 424628   4184 44059628    0    0  9105   175 24546 15354 13 20 66  1  0
>  2  0      0 377164   4184 44070564    0    0  9363     3 21191 12608 11  2 87  0  0
>  5  0      0 353360   4184 44040804    0    0  6661     0 20931 12815 12  2 85  0  0
>  4  0      0 355144   4180 44034620    0    0  7061     8 21264 12379 11  3 86  0  0
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
>  r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
> 21  0      0 358396   4180 43958420    0    0  7595  1749 23258 12299 10 27 63  0  0
>  6  1      0 437480   4160 43922152    0    0 17565    14 17059 14928  6 18 74  2  0
>  6  0      0 380304   4160 43993932    0    0 10120   168 21519 12798 11  2 87  0  0
>  8  0      0 337740   4160 44007432    0    0  6033   520 20872 12461 11  1 88  0  0
> 13  0      0 349712   4132 43927784    0    0  6777     6 20919 12568 11  2 86  0  0
>  6  1      0 351180   4112 43899756    0    0  8640     0 22543 12519 11 10 78  0  0
>  6  0      0 356392   4112 43921532    0    0  6206    48 20383 12050 12  1 86  0  0
>  6  0      0 355552   4108 43863448    0    0  6106     3 21244 11817  9  9 82  0  0
>  3  0      0 364992   7312 43856824    0    0 11283   199 21296 12638 13  2 85  0  0
>  4  1      0 371968   7120 43818552    0    0  6715  1534 22322 13305 11  7 81  0  0
> debug2: channel 0: window 999365 sent adjust 49211
> 12  0      0 338540   7120 43822256    0    0  9142     3 21520 12194 13  5 82  0  0
>  8  0      0 386016   7112 43717136    0    0  2123     3 20465 11466  8 20 72  0  0
>  8  0      0 352388   7112 43715872    0    0 10366    51 25758 13879 16 19 65  0  0
> 20  0      0 351472   7112 43701060    0    0 13091    10 23766 12832 11 11 77  1  0
>  2  0      0 386820   7112 43587520    0    0   482   210 17187 6773  3 69 28  0  0
> 64  0      0 401956   7112 43473728    0    0     0     5 10796 9487  0 55 44  0  0
>
>
> On Tue, Apr 2, 2013 at 12:56 AM, Vasilis Ventirozos <v.ventiro...@gmail.com> wrote:
>
>> Hello, I think that during the checkpoint your system pauses all clients
>> in order to flush all the data from the controller's cache to the disks.
>> If I were you I'd try to tune the checkpoint parameters better; if that
>> doesn't help, please show us some vmstat output.
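>>
>> The checkpoint parameters I mean are roughly these in postgresql.conf
>> (values only illustrative, the right ones depend on your write volume):
>>
>> checkpoint_segments = 128               # enough WAL to avoid forced checkpoints
>> checkpoint_timeout = 15min              # example value only
>> checkpoint_completion_target = 0.9      # spread the writes over the interval
>> log_checkpoints = on                    # so stalls can be matched to checkpoints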
>>
>> Vasilis Ventirozos
>> ---------- Forwarded message ----------
>> From: "Armand du Plessis" <a...@bank.io>
>> Date: Apr 2, 2013 1:37 AM
>> Subject: [PERFORM] Problems with pg_locks explosion
>> To: "pgsql-performance" <pgsql-performance@postgresql.org>
>> Cc:
>>
>> [Apologies, I first sent this to the incorrect list, postgres-admin, in
>> the event you receive it twice]
>>
>> Hi there,
>>
>> I'm hoping someone on the list can shed some light on an issue I'm having
>> with our PostgreSQL cluster. I'm literally tearing out my hair and don't
>> have a deep enough understanding of Postgres to find the problem.
>>
>> What's happening is that I had severe disk I/O issues on our original
>> Postgres cluster (9.0.8) and switched to a new instance with a RAID-0
>> volume array. The machine's CPU usage would hover around 30% and the
>> database would run lightning fast, with pg_locks hovering between 100
>> and 200.
>>
>> Within a few seconds something triggers a massive increase in pg_locks,
>> which suddenly shoots up to 4000-8000. At this point everything dies:
>> queries that usually take a few milliseconds take minutes, and
>> everything is unresponsive until I restart Postgres.
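>>
>> (If it helps, I can also capture what the backends are doing while it's
>> wedged; roughly something like this, using the 9.0 catalogs:)
>>
>> SELECT procpid, waiting, now() - query_start AS runtime, current_query
>> FROM pg_stat_activity
>> WHERE waiting
>> ORDER BY runtime DESC;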
>>
>> The instance itself still sits mostly idle at this point. The only clue
>> I could find is that it usually starts a few minutes after checkpoint
>> entries appear in my logs.
>>
>> Any suggestions would really be appreciated; it's killing our business
>> at the moment. I can supply more info if required, but I've pasted what
>> I thought would be useful below. I'm not sure what else to change in
>> the settings.
>>
>> Kind regards,
>>
>> Armand
>>
>>
>>
>> It's on Amazon EC2 -
>> * cc2.8xlarge instance type
>> * 6 volumes in RAID-0 configuration. (1000 PIOPS)
>>
>> 60.5 GiB of memory
>> 88 EC2 Compute Units (2 x Intel Xeon E5-2670, eight-core)
>> 3370 GB of instance storage
>> 64-bit platform
>> I/O Performance: Very High (10 Gigabit Ethernet)
>> EBS-Optimized Available: No**
>> API name: cc2.8xlarge
>>
>>
>> postgresql.conf
>> fsync = off
>> full_page_writes = off
>> default_statistics_target = 100
>> maintenance_work_mem = 1GB
>> checkpoint_completion_target = 0.9
>> effective_cache_size = 48GB
>> work_mem = 64MB
>> wal_buffers = -1
>> checkpoint_segments = 128
>> shared_buffers = 32GB
>> max_connections = 800
>> effective_io_concurrency = 3           # Down from 6
>>
>> # - Background Writer -
>>
>> #bgwriter_delay = 200ms                 # 10-10000ms between rounds
>> #bgwriter_lru_maxpages = 100            # 0-1000 max buffers written/round
>> #bgwriter_lru_multiplier = 2.0          # 0-10.0 multiplier on buffers scanned/round
>>
>>
>> $ free
>>              total       used       free     shared    buffers     cached
>> Mem:      61368192   60988180     380012          0        784   44167172
>> -/+ buffers/cache:   16820224   44547968
>> Swap:            0          0          0
>>
>> $ top -c
>> top - 21:55:51 up 12 days, 12:41,  4 users,  load average: 6.03, 16.10, 24.15
>> top - 21:55:54 up 12 days, 12:41,  4 users,  load average: 6.03, 15.94, 24.06
>> Tasks: 837 total,   6 running, 831 sleeping,   0 stopped,   0 zombie
>> Cpu(s): 15.7%us,  1.7%sy,  0.0%ni, 81.6%id,  0.3%wa,  0.0%hi,  0.6%si,  0.0%st
>> Mem:  61368192k total, 54820988k used,  6547204k free,     9032k buffer
>>
>> [ec2-user@ip-10-155-231-112 ~]$ sudo iostat
>> Linux 3.2.39-6.88.amzn1.x86_64 ()  04/01/2013  _x86_64_ (32 CPU)
>>
>> avg-cpu:  %user   %nice %system %iowait  %steal   %idle
>>           21.00    0.00    1.10    0.26    0.00   77.63
>>
>> Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
>> xvda              0.21         5.00         2.22    5411830    2401368
>> xvdk             98.32      1774.67       969.86 1919359965 1048932113
>> xvdj             98.28      1773.68       969.14 1918288697 1048156776
>> xvdi             98.29      1773.69       969.61 1918300250 1048662470
>> xvdh             98.24      1773.92       967.54 1918544618 1046419936
>> xvdg             98.27      1774.15       968.85 1918790636 1047842846
>> xvdf             98.32      1775.56       968.69 1920316435 1047668172
>> md127           733.85     10645.68      5813.70 11513598393 6287682313
>>
>> What bugs me about this is that CloudWatch shows the throughput
>> percentage at 100% on all of these volumes.
>>
>> The problems seem to overlap with checkpoints.
>>
>> 2013-04-01 21:31:35.592 UTC,,,26877,,5159fa5f.68fd,1,,2013-04-01 21:21:35 UTC,,0,LOG,00000,"checkpoint starting: time",,,,,,,,,""
>> 2013-04-01 21:40:35.033 UTC,,,26877,,5159fa5f.68fd,2,,2013-04-01 21:21:35 UTC,,0,LOG,00000,"checkpoint complete: wrote 100635 buffers (2.4%); 0 transaction log file(s) added, 0 removed, 1 recycled; write=539.439 s, sync=0.000 s, total=539.441 s; sync files=0, longest=0.000 s, average=0.000 s",,,,,,,,,""
>> 2013-04-01 21:41:35.093 UTC,,,26877,,5159fa5f.68fd,3,,2013-04-01 21:21:35 UTC,,0,LOG,00000,"checkpoint starting: time",,,,,,,,,""
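>>
>> (I can also pull the checkpoint/bgwriter counters if that would be
>> useful, i.e. something along these lines:)
>>
>> SELECT checkpoints_timed, checkpoints_req, buffers_checkpoint,
>>        buffers_clean, maxwritten_clean, buffers_backend, buffers_alloc
>> FROM pg_stat_bgwriter;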
>>
>>
>>
>>
>>
>
