I have run the command you sent (thanks for the link, I understood
why it is necessary to run it), and I created a 21 GB file, but it was
written to the same OSD (and the available space on the partition is
unchanged, much less than it should be). The output of the crush dump is:
ceph osd crush dump
{
"devices": [
{
"id": 0,
"name": "osd.0",
"class": "ssd"
},
{
"id": 1,
"name": "osd.1",
"class": "ssd"
},
{
"id": 2,
"name": "osd.2",
"class": "ssd"
},
{
"id": 3,
"name": "osd.3",
"class": "ssd"
},
{
"id": 4,
"name": "osd.4",
"class": "ssd"
},
{
"id": 5,
"name": "osd.5",
"class": "ssd"
},
{
"id": 6,
"name": "osd.6",
"class": "ssd"
},
{
"id": 7,
"name": "osd.7",
"class": "ssd"
},
{
"id": 8,
"name": "osd.8",
"class": "ssd"
},
{
"id": 9,
"name": "osd.9",
"class": "ssd"
},
{
"id": 10,
"name": "osd.10",
"class": "ssd"
},
{
"id": 11,
"name": "osd.11",
"class": "ssd"
}
],
"types": [
{
"type_id": 0,
"name": "osd"
},
{
"type_id": 1,
"name": "host"
},
{
"type_id": 2,
"name": "chassis"
},
{
"type_id": 3,
"name": "rack"
},
{
"type_id": 4,
"name": "row"
},
{
"type_id": 5,
"name": "pdu"
},
{
"type_id": 6,
"name": "pod"
},
{
"type_id": 7,
"name": "room"
},
{
"type_id": 8,
"name": "datacenter"
},
{
"type_id": 9,
"name": "zone"
},
{
"type_id": 10,
"name": "region"
},
{
"type_id": 11,
"name": "root"
}
],
"buckets": [
{
"id": -1,
"name": "default",
"type_id": 11,
"type_name": "root",
"weight": 10986996,
"alg": "straw2",
"hash": "rjenkins1",
"items": [
{
"id": -3,
"weight": 10986996,
"pos": 0
}
]
},
{
"id": -2,
"name": "default~ssd",
"type_id": 11,
"type_name": "root",
"weight": 10986996,
"alg": "straw2",
"hash": "rjenkins1",
"items": [
{
"id": -4,
"weight": 10986996,
"pos": 0
}
]
},
{
"id": -3,
"name": "sto-core-hpc01",
"type_id": 1,
"type_name": "host",
"weight": 10986996,
"alg": "straw2",
"hash": "rjenkins1",
"items": [
{
"id": 0,
"weight": 915583,
"pos": 0
},
{
"id": 1,
"weight": 915583,
"pos": 1
},
{
"id": 10,
"weight": 915583,
"pos": 2
},
{
"id": 11,
"weight": 915583,
"pos": 3
},
{
"id": 2,
"weight": 915583,
"pos": 4
},
{
"id": 3,
"weight": 915583,
"pos": 5
},
{
"id": 4,
"weight": 915583,
"pos": 6
},
{
"id": 5,
"weight": 915583,
"pos": 7
},
{
"id": 6,
"weight": 915583,
"pos": 8
},
{
"id": 7,
"weight": 915583,
"pos": 9
},
{
"id": 8,
"weight": 915583,
"pos": 10
},
{
"id": 9,
"weight": 915583,
"pos": 11
}
]
},
{
"id": -4,
"name": "sto-core-hpc01~ssd",
"type_id": 1,
"type_name": "host",
"weight": 10986996,
"alg": "straw2",
"hash": "rjenkins1",
"items": [
{
"id": 0,
"weight": 915583,
"pos": 0
},
{
"id": 1,
"weight": 915583,
"pos": 1
},
{
"id": 10,
"weight": 915583,
"pos": 2
},
{
"id": 11,
"weight": 915583,
"pos": 3
},
{
"id": 2,
"weight": 915583,
"pos": 4
},
{
"id": 3,
"weight": 915583,
"pos": 5
},
{
"id": 4,
"weight": 915583,
"pos": 6
},
{
"id": 5,
"weight": 915583,
"pos": 7
},
{
"id": 6,
"weight": 915583,
"pos": 8
},
{
"id": 7,
"weight": 915583,
"pos": 9
},
{
"id": 8,
"weight": 915583,
"pos": 10
},
{
"id": 9,
"weight": 915583,
"pos": 11
}
]
}
],
"rules": [
{
"rule_id": 0,
"rule_name": "replicated_rule",
"type": 1,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "chooseleaf_firstn",
"num": 0,
"type": "host"
},
{
"op": "emit"
}
]
}
],
"tunables": {
"choose_local_tries": 0,
"choose_local_fallback_tries": 0,
"choose_total_tries": 50,
"chooseleaf_descend_once": 1,
"chooseleaf_vary_r": 1,
"chooseleaf_stable": 1,
"msr_descents": 100,
"msr_collision_tries": 100,
"straw_calc_version": 1,
"allowed_bucket_algs": 54,
"profile": "jewel",
"optimal_tunables": 1,
"legacy_tunables": 0,
"minimum_required_version": "jewel",
"require_feature_tunables": 1,
"require_feature_tunables2": 1,
"has_v2_rules": 0,
"require_feature_tunables3": 1,
"has_v3_rules": 0,
"has_v4_buckets": 1,
"require_feature_tunables5": 1,
"has_v5_rules": 0,
"has_msr_rules": 0
},
"choose_args": {
"-1": [
{
"bucket_id": -1,
"weight_set": [
[
167.64825439453125
]
]
},
{
"bucket_id": -2,
"weight_set": [
[
167.64825439453125
]
]
},
{
"bucket_id": -3,
"weight_set": [
[
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938
]
]
},
{
"bucket_id": -4,
"weight_set": [
[
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938,
13.970687866210938
]
]
}
]
}
}
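One thing worth noting from the dump above: replicated_rule still does
chooseleaf_firstn over type "host", and there is only one host bucket, so a
replicated pool can effectively place each PG on only one OSD, leaving the
other replicas unmapped. A quick way to see where the objects of the test
file actually land (just a sketch, assuming the cephfs.cephfs.data pool name
from elsewhere in this thread; <object-name> stands for one of the names the
first command prints):

rados -p cephfs.cephfs.data ls | head -5
ceph osd map cephfs.cephfs.data <object-name>

The second command prints the PG and the up/acting OSD set for that object.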
Best,
Mihai
On 2025-03-28 14:27, Anthony D'Atri wrote:
Yikes, something is off here.
Is your entire cluster on one host? If so, beware that this is a
very risky proposition.
ceph config set global osd_crush_chooseleaf_type 0
https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-pg/#one-node-cluster
I suspect that will at least distribute your data better.
If not, please send `ceph osd crush dump`
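One caveat, in case that setting alone does not move existing data:
osd_crush_chooseleaf_type is normally consulted when a rule is first
created, so if replicated_rule keeps its "host" failure domain, a common
alternative (just a sketch, not necessarily what the linked page prescribes)
is to create a replicated rule with an "osd" failure domain and point the
pools at it. Assuming the default root and the pool names shown elsewhere in
this thread (the rule name replicated_osd here is arbitrary):

ceph osd crush rule create-replicated replicated_osd default osd
ceph osd pool set cephfs.cephfs.data crush_rule replicated_osd
ceph osd pool set cephfs.cephfs.meta crush_rule replicated_osd

On a single host all replicas then share one failure domain, which is
exactly the risk mentioned above.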
On Mar 28, 2025, at 6:59 AM, Mihai Ciubancan
<mihai.ciuban...@eli-np.ro> wrote:
Hi Anthony,
Thanks for the answer:
The output of 'ceph osd df' is:
ceph osd df tree
ID  CLASS  WEIGHT     REWEIGHT  SIZE     RAW USE  DATA     OMAP     META     AVAIL    %USE   VAR    PGS  STATUS  TYPE NAME
-1         167.64825  -         168 TiB   12 TiB   12 TiB  100 MiB   27 GiB  156 TiB   7.13   1.00    -          root default
-3         167.64825  -         168 TiB   12 TiB   12 TiB  100 MiB   27 GiB  156 TiB   7.13   1.00    -          host sto-core-hpc01
 0  ssd     13.97069   1.00000   14 TiB   32 MiB  4.1 MiB   12 KiB   28 MiB   14 TiB      0      0    0      up  osd.0
 1  ssd     13.97069   1.00000   14 TiB   12 TiB   12 TiB    6 KiB   26 GiB  2.0 TiB  85.53  12.00  129      up  osd.1
 2  ssd     13.97069   1.00000   14 TiB   32 MiB  4.1 MiB   12 KiB   28 MiB   14 TiB      0      0    0      up  osd.2
 3  ssd     13.97069   1.00000   14 TiB  1.7 GiB  258 MiB  100 MiB  1.3 GiB   14 TiB   0.01   0.00   16      up  osd.3
 4  ssd     13.97069   1.00000   14 TiB   32 MiB  4.1 MiB   12 KiB   28 MiB   14 TiB      0      0    0      up  osd.4
 5  ssd     13.97069   1.00000   14 TiB   32 MiB  4.1 MiB   12 KiB   28 MiB   14 TiB      0      0    0      up  osd.5
 6  ssd     13.97069   1.00000   14 TiB   32 MiB  4.1 MiB   12 KiB   28 MiB   14 TiB      0      0    0      up  osd.6
 7  ssd     13.97069   1.00000   14 TiB   32 MiB  4.1 MiB   12 KiB   28 MiB   14 TiB      0      0    0      up  osd.7
 8  ssd     13.97069   1.00000   14 TiB   68 MiB  4.8 MiB   12 KiB   63 MiB   14 TiB      0      0    1      up  osd.8
 9  ssd     13.97069   1.00000   14 TiB   32 MiB  4.1 MiB   12 KiB   28 MiB   14 TiB      0      0    0      up  osd.9
10  ssd     13.97069   1.00000   14 TiB   32 MiB  4.1 MiB   12 KiB   28 MiB   14 TiB      0      0    0      up  osd.10
11  ssd     13.97069   1.00000   14 TiB   68 MiB  4.8 MiB   12 KiB   63 MiB   14 TiB      0      0    1      up  osd.11
                         TOTAL   168 TiB   12 TiB   12 TiB  100 MiB   27 GiB  156 TiB   7.13
So all the data is on osd.1.
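To double-check that, one could look at the per-PG mapping of the data pool
(a sketch, using the cephfs.cephfs.data pool name from the df output; the UP
and ACTING columns show which OSDs each PG maps to):

ceph pg ls-by-pool cephfs.cephfs.data | head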
But I have checked the balancer and it seems active:
ceph balancer status
{
"active": true,
"last_optimize_duration": "0:00:00.000368",
"last_optimize_started": "Fri Mar 28 10:55:06 2025",
"mode": "upmap",
"no_optimization_needed": false,
"optimize_result": "Some objects (0.500000) are degraded; try again
later",
"plans": []
}
But the output of the command 'ceph config dump | grep balancer' gives
me nothing.
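Two things may explain that: the balancer will not optimize while objects
are degraded (that is what the optimize_result above says), and as far as I
know 'ceph config dump' only lists options that were explicitly changed from
their defaults, so an empty grep does not mean the balancer is unconfigured.
To see why objects are degraded, standard commands such as these should be
enough:

ceph -s                    # overall health, including degraded/undersized PG counts
ceph health detail | head  # which PGs are undersized or degraded, and why

In a single-host setup with a "host" failure domain, the degraded objects
are most likely just the replica copies that CRUSH cannot place on another
host.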
Best,
Mihai
On 2025-03-27 23:06, Anthony D'Atri wrote:
Look at `ceph osd df`. Is the balancer enabled?
On Mar 27, 2025, at 8:50 AM, Mihai Ciubancan
<mihai.ciuban...@eli-np.ro> wrote:
Hello,
My name is Mihai, and I started using Ceph this month for an HPC
cluster.
When it was launched into production the available space shown was 80 TB;
now it is 16 TB and I didn't change anything, while I have 12 OSDs (14 TB
SSDs each):
sudo ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 167.64825 root default
-3 167.64825 host sto-core-hpc01
0 ssd 13.97069 osd.0 up 1.00000 1.00000
1 ssd 13.97069 osd.1 up 1.00000 1.00000
2 ssd 13.97069 osd.2 up 1.00000 1.00000
3 ssd 13.97069 osd.3 up 1.00000 1.00000
4 ssd 13.97069 osd.4 up 1.00000 1.00000
5 ssd 13.97069 osd.5 up 1.00000 1.00000
6 ssd 13.97069 osd.6 up 1.00000 1.00000
7 ssd 13.97069 osd.7 up 1.00000 1.00000
8 ssd 13.97069 osd.8 up 1.00000 1.00000
9 ssd 13.97069 osd.9 up 1.00000 1.00000
10 ssd 13.97069 osd.10 up 1.00000 1.00000
11 ssd 13.97069 osd.11 up 1.00000 1.00000
sudo ceph df detail
--- RAW STORAGE ---
CLASS SIZE AVAIL USED RAW USED %RAW USED
ssd 168 TiB 156 TiB 12 TiB 12 TiB 7.12
TOTAL 168 TiB 156 TiB 12 TiB 12 TiB 7.12
--- POOLS ---
POOL                ID  PGS  STORED   (DATA)   (OMAP)  OBJECTS  USED     (DATA)   (OMAP)  %USED  MAX AVAIL  QUOTA OBJECTS  QUOTA BYTES  DIRTY  USED COMPR  UNDER COMPR
.mgr                 1    1  705 KiB  705 KiB     0 B        2  1.4 MiB  1.4 MiB     0 B      0    8.1 TiB            N/A          N/A    N/A         0 B          0 B
cephfs.cephfs.meta   2   16  270 MiB  270 MiB     0 B   85.96k  270 MiB  270 MiB     0 B      0     16 TiB            N/A          N/A    N/A         0 B          0 B
cephfs.cephfs.data   3  129   12 TiB   12 TiB     0 B    3.73M   12 TiB   12 TiB     0 B  42.49     16 TiB            N/A          N/A    N/A         0 B          0 B
While on the client side I have this:
$ df -h
10.18.31.1:6789:/ 21T 13T 8.1T 61% /data
I don't know where all the space that was there at the beginning has gone.
Does anyone have a hint?
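In case it is relevant: as far as I understand, a pool's MAX AVAIL is
roughly the projected free space of the fullest OSD its CRUSH rule can use,
divided by the replica count, so it shrinks quickly once data piles up on a
single OSD. A quick sanity check (a sketch, using the data pool name from
the output above):

ceph osd pool get cephfs.cephfs.data size        # replica count of the data pool
ceph osd pool get cephfs.cephfs.data crush_rule  # which CRUSH rule the pool uses
ceph osd df tree                                 # per-OSD usage, to spot imbalance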
Best regards,
Mihai
_______________________________________________
ceph-users mailing list -- ceph-users@ceph.io
To unsubscribe send an email to ceph-users-le...@ceph.io