Thanks for the reply.

I've now updated the background writer settings to:

# - Background Writer -

bgwriter_delay = 200ms                  # 10-10000ms between rounds
bgwriter_lru_maxpages = 400             # 0-1000 max buffers written/round
bgwriter_lru_multiplier = 2.0           # 0-10.0 multiplier on buffers scanned/round

checkpoint_segments = 128
checkpoint_timeout = 25min
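
To see where the writes are actually coming from, I'm going to keep an eye on
pg_stat_bgwriter between spikes, roughly like this (just a sketch; these are
the column names as I understand them on 9.0):

SELECT checkpoints_timed, checkpoints_req,
       buffers_checkpoint, buffers_clean,
       maxwritten_clean, buffers_backend, buffers_alloc
FROM pg_stat_bgwriter;

If buffers_backend grows much faster than buffers_clean, the backends are
still doing most of their own writes and the bgwriter settings above are
probably not aggressive enough yet.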

It's still happening at the moment, this time without any checkpoint
entries in the log :(
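
Next time the locks spike I'll also try to capture which queries are stuck
waiting, with something along these lines (rough sketch; on 9.0 the
pg_stat_activity columns are procpid and current_query):

SELECT l.pid, l.locktype, l.mode, l.granted, a.current_query
FROM pg_locks l
JOIN pg_stat_activity a ON a.procpid = l.pid
WHERE NOT l.granted;

That should at least show whether everything is piling up behind a handful of
lock holders.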

Below is the output from vmstat. I'm not sure what to look for in it.

Thanks again,

Armand


$ sudo vmstat 5
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 2  0      0 485800   4224 44781700    0    0   167    91    1    0 21  1 78  0  0
 7  0      0 353920   4224 44836176    0    0  6320    54 21371 12921 11  2 87  0  0
32  0      0 352220   4232 44749544    0    0  1110     8 19414 9620  6 42 52  0  0
 3  0      0 363044   4232 44615772    0    0    59  1943 11185 3774  0 81 18  0  0
48  0      0 360076   4240 44550744    0    0     0    34 9563 5210  0 74 26  0  0
33  0      0 413708   4240 44438248    0    0    92   962 11250 8169  0 61 39  0  0
109  0      0 418080   4240 44344596    0    0   605  3490 10098 6216  1 49 50  0  0
58  0      0 425388   4240 44286528    0    0     5    10 10794 2470  1 91  8  0  0
53  0      0 435864   4240 44243000    0    0    11     0 9755 2428  0 92  8  0  0
12  0      0 440792   4248 44213164    0    0   134     5 7883 3038  0 51 49  0  0
 3  0      0 440360   4256 44158684    0    0   548   146 8450 3930  2 27 70  0  0
 2  0      0 929236   4256 44248608    0    0 10466   845 22575 14196 20  5 74  0  0
 4  0      0 859160   4256 44311828    0    0  7120    61 20890 12835 12  1 86  0  0
 4  0      0 685308   4256 44369404    0    0  6110    24 20645 12545 13  1 85  0  0
 4  0      0 695440   4256 44396304    0    0  5351  1208 19529 11781 11  1 88  0  0
 4  0      0 628276   4256 44468116    0    0  9202     0 19875 12172  9  1 89  0  0
 6  0      0 579716   4256 44503848    0    0  3799    22 19223 11772 10  1 88  0  0
 3  1      0 502948   4256 44539784    0    0  3721  6700 20620 11939 13  1 85  0  0
 4  0      0 414120   4256 44583456    0    0  3860   856 19801 12092 10  1 89  0  0
 6  0      0 349240   4256 44642880    0    0  6122    48 19834 11933 11  2 87  0  0
 3  0      0 400536   4256 44535872    0    0  6287     5 18945 11461 10  1 89  0  0
 3  0      0 364256   4256 44592412    0    0  5487  2018 20145 12344 11  1 87  0  0
 7  0      0 343732   4256 44598784    0    0  4209    24 19099 11482 10  1 88  0  0
 6  0      0 339608   4236 44576768    0    0  6805   151 18821 11333  9  2 89  0  0
 9  1      0 339364   4236 44556884    0    0  2597  4339 19205 11918 11  3 85  0  0
24  0      0 341596   4236 44480368    0    0  6165  5309 19353 11562 11  4 84  1  0
30  0      0 359044   4236 44416452    0    0  1364     6 12638 6138  5 28 67  0  0
 4  0      0 436468   4224 44326500    0    0  3704  1264 11346 7545  4 27 68  0  0
 3  1      0 459736   4224 44384788    0    0  6541     8 20159 12097 11  1 88  0  0
 8  1      0 347812   4224 44462100    0    0 12292  2860 20851 12377  9  1 89  1  0
 1  0      0 379752   4224 44402396    0    0  5849   147 20171 12253 11  1 88  0  0
 4  0      0 453692   4216 44243480    0    0  6546   269 20689 13028 12  2 86  0  0
 8  0      0 390160   4216 44259768    0    0  4243     0 20476 21238  6 16 78  0  0
 6  0      0 344504   4216 44336264    0    0  7214     2 20919 12625 11  1 87  0  0
 4  0      0 350128   4200 44324976    0    0 10726  2173 20417 12351 10  1 88  0  0
 2  1      0 362300   4200 44282484    0    0  7148   714 22469 14468 12  2 86  0  0
 3  0      0 366252   4184 44311680    0    0  7617   133 20487 12364  9  1 90  0  0
 6  0      0 368904   4184 44248152    0    0  5162     6 22910 15221 14  7 80  0  0
 2  0      0 383108   4184 44276780    0    0  5846  1120 21109 12563 11  1 88  0  0
 7  0      0 338348   4184 44274472    0    0  9270     5 21243 12698 10  1 88  0  0
24  0      0 339676   4184 44213036    0    0  6639    18 22976 12700 13 12 74  0  0
12  0      0 371848   4184 44146500    0    0   657   133 18968 7445  5 53 43  0  0
37  0      0 374516   4184 44076212    0    0    16     2 9156 4472  1 48 52  0  0
16  0      0 398412   4184 43971060    0    0   127     0 9967 6018  0 48 52  0  0
 4  0      0 417312   4184 44084392    0    0 17434  1072 23661 14268 16  6 78  1  0
 4  0      0 407672   4184 44139896    0    0  5785     0 19779 11869 11  1 88  0  0
 9  0      0 349544   4184 44051596    0    0  6899     8 20376 12774 10  3 88  0  0
 5  0      0 424628   4184 44059628    0    0  9105   175 24546 15354 13 20 66  1  0
 2  0      0 377164   4184 44070564    0    0  9363     3 21191 12608 11  2 87  0  0
 5  0      0 353360   4184 44040804    0    0  6661     0 20931 12815 12  2 85  0  0
 4  0      0 355144   4180 44034620    0    0  7061     8 21264 12379 11  3 86  0  0
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
21  0      0 358396   4180 43958420    0    0  7595  1749 23258 12299 10 27 63  0  0
 6  1      0 437480   4160 43922152    0    0 17565    14 17059 14928  6 18 74  2  0
 6  0      0 380304   4160 43993932    0    0 10120   168 21519 12798 11  2 87  0  0
 8  0      0 337740   4160 44007432    0    0  6033   520 20872 12461 11  1 88  0  0
13  0      0 349712   4132 43927784    0    0  6777     6 20919 12568 11  2 86  0  0
 6  1      0 351180   4112 43899756    0    0  8640     0 22543 12519 11 10 78  0  0
 6  0      0 356392   4112 43921532    0    0  6206    48 20383 12050 12  1 86  0  0
 6  0      0 355552   4108 43863448    0    0  6106     3 21244 11817  9  9 82  0  0
 3  0      0 364992   7312 43856824    0    0 11283   199 21296 12638 13  2 85  0  0
 4  1      0 371968   7120 43818552    0    0  6715  1534 22322 13305 11  7 81  0  0
12  0      0 338540   7120 43822256    0    0  9142     3 21520 12194 13  5 82  0  0
 8  0      0 386016   7112 43717136    0    0  2123     3 20465 11466  8 20 72  0  0
 8  0      0 352388   7112 43715872    0    0 10366    51 25758 13879 16 19 65  0  0
20  0      0 351472   7112 43701060    0    0 13091    10 23766 12832 11 11 77  1  0
 2  0      0 386820   7112 43587520    0    0   482   210 17187 6773  3 69 28  0  0
64  0      0 401956   7112 43473728    0    0     0     5 10796 9487  0 55 44  0  0


On Tue, Apr 2, 2013 at 12:56 AM, Vasilis Ventirozos
<v.ventiro...@gmail.com> wrote:

> Hello, I think that during the checkpoint your system pauses all clients
> in order to flush all data from the controller's cache to the disks. If I
> were you I'd try to tune my checkpoint parameters better; if that doesn't
> work, show us some vmstat output please.
>
> Vasilis Ventirozos
> ---------- Forwarded message ----------
> From: "Armand du Plessis" <a...@bank.io>
> Date: Apr 2, 2013 1:37 AM
> Subject: [PERFORM] Problems with pg_locks explosion
> To: "pgsql-performance" <pgsql-performance@postgresql.org>
> Cc:
>
> [Apologies, I first sent this to the incorrect list, postgres-admin, in
> the event you receive it twice]
>
> Hi there,
>
> I'm hoping someone on the list can shed some light on an issue I'm having
> with our Postgresql cluster. I'm literally tearing out my hair and don't
> have a deep enough understanding of Postgres to find the problem.
>
> What's happening is that I had severe disk I/O issues on our original
> Postgres cluster (9.0.8) and switched to a new instance with a RAID-0
> volume array. The machine's CPU usage would hover around 30% and our
> database would run lightning fast, with the number of entries in pg_locks
> hovering between 100 and 200.
>
> Within a few seconds something would trigger a massive increase in
> pg_locks, so that it suddenly shoots up to 4000-8000 entries. At this
> point everything dies. Queries that usually take a few milliseconds take
> minutes, and everything is unresponsive until I restart Postgres.
>
> The instance itself is still mostly idle at this point. The only clue I
> could find was that it usually starts a few minutes after the checkpoint
> entries appear in my logs.
>
> Any suggestions would really be appreciated. It's killing our business at
> the moment. I can supply more info if required but pasted what I thought
> would be useful below. Not sure what else to change in the settings.
>
> Kind regards,
>
> Armand
>
>
>
> It's on Amazon EC2 -
> * cc2.8xlarge instance type
> * 6 volumes in RAID-0 configuration. (1000 PIOPS)
>
> 60.5 GiB of memory
> 88 EC2 Compute Units (2 x Intel Xeon E5-2670, eight-core)
> 3370 GB of instance storage
> 64-bit platform
> I/O Performance: Very High (10 Gigabit Ethernet)
> EBS-Optimized Available: No
> API name: cc2.8xlarge
>
>
> postgresql.conf
> fsync = off
> full_page_writes = off
> default_statistics_target = 100
> maintenance_work_mem = 1GB
> checkpoint_completion_target = 0.9
> effective_cache_size = 48GB
> work_mem = 64MB
> wal_buffers = -1
>  checkpoint_segments = 128
> shared_buffers = 32GB
> max_connections = 800
>  effective_io_concurrency = 3 # Down from 6
>
> # - Background Writer -
>
> #bgwriter_delay = 200ms                 # 10-10000ms between rounds
> #bgwriter_lru_maxpages = 100            # 0-1000 max buffers written/round
> #bgwriter_lru_multiplier = 2.0          # 0-10.0 multiplier on buffers scanned/round
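
A quick sanity check, to confirm the values the server is actually running
with, is something like this (just a sketch; the list of names can obviously
be extended):

SELECT name, setting, unit, source
FROM pg_settings
WHERE name IN ('shared_buffers', 'checkpoint_segments',
               'checkpoint_timeout', 'checkpoint_completion_target',
               'bgwriter_delay', 'bgwriter_lru_maxpages');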
>
>
> $ free
>              total       used       free     shared    buffers     cached
> Mem:      61368192   60988180     380012          0        784   44167172
> -/+ buffers/cache:   16820224   44547968
> Swap:            0          0          0
>
> $ top -c
> top - 21:55:51 up 12 days, 12:41,  4 users,  load average: 6.03, 16.10, 24.15
> top - 21:55:54 up 12 days, 12:41,  4 users,  load average: 6.03, 15.94, 24.06
> Tasks: 837 total,   6 running, 831 sleeping,   0 stopped,   0 zombie
> Cpu(s): 15.7%us,  1.7%sy,  0.0%ni, 81.6%id,  0.3%wa,  0.0%hi,  0.6%si,  0.0%st
> Mem:  61368192k total, 54820988k used,  6547204k free,     9032k buffer
>
> [ec2-user@ip-10-155-231-112 ~]$ sudo iostat
> Linux 3.2.39-6.88.amzn1.x86_64 ()  04/01/2013  _x86_64_ (32 CPU)
>
> avg-cpu:  %user   %nice %system %iowait  %steal   %idle
>           21.00    0.00    1.10    0.26    0.00   77.63
>
> Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
> xvda              0.21         5.00         2.22    5411830    2401368
> xvdk             98.32      1774.67       969.86 1919359965 1048932113
> xvdj             98.28      1773.68       969.14 1918288697 1048156776
> xvdi             98.29      1773.69       969.61 1918300250 1048662470
> xvdh             98.24      1773.92       967.54 1918544618 1046419936
> xvdg             98.27      1774.15       968.85 1918790636 1047842846
> xvdf             98.32      1775.56       968.69 1920316435 1047668172
> md127           733.85     10645.68      5813.70 11513598393 6287682313
>
> What bugs me about this is that the throughput percentage in CloudWatch
> is at 100% for all of the volumes.
>
> The problems seem to overlap with checkpoints.
>
> 2013-04-01 21:31:35.592 UTC,,,26877,,5159fa5f.68fd,1,,2013-04-01 21:21:35 UTC,,0,LOG,00000,"checkpoint starting: time",,,,,,,,,""
> 2013-04-01 21:40:35.033 UTC,,,26877,,5159fa5f.68fd,2,,2013-04-01 21:21:35 UTC,,0,LOG,00000,"checkpoint complete: wrote 100635 buffers (2.4%); 0 transaction log file(s) added, 0 removed, 1 recycled; write=539.439 s, sync=0.000 s, total=539.441 s; sync files=0, longest=0.000 s, average=0.000 s",,,,,,,,,""
> 2013-04-01 21:41:35.093 UTC,,,26877,,5159fa5f.68fd,3,,2013-04-01 21:21:35 UTC,,0,LOG,00000,"checkpoint starting: time",,,,,,,,,""
>
>
>
>
>
