Skip to content

Instantly share code, notes, and snippets.

@linuxkidd
Created May 5, 2020 15:34
Show Gist options
  • Select an option

  • Save linuxkidd/019d54ecbdb4bc0d53b37fd1125ec5b1 to your computer and use it in GitHub Desktop.

Select an option

Save linuxkidd/019d54ecbdb4bc0d53b37fd1125ec5b1 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import argparse,json,math,re,signal,sys
from subprocess import Popen, PIPE
from StringIO import StringIO
from pprint import pprint
# ---------------------------------------------------------------------------
# Command line handling and loading of the inconsistency report.
#
# The report (the JSON printed by `rados list-inconsistent-obj <pgid>`) is
# read from stdin when stdin is not a terminal, otherwise it is fetched by
# running that command for the PG given with --id.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Custom script inconsistent PG cleanup')
parser.add_argument(
    "--id",
    dest="pgid",
    required=True,
    # Fixed the misspelled command name ("list-inconstent-obj") in the help text.
    help="Provide the PG id for this list-inconsistent-obj output.",
)
args = parser.parse_args()
pgid = args.pgid

# When True, the per-object section of this script only prints the equivalent
# shell commands instead of executing them.
debug = False

if not sys.stdin.isatty():
    print("Loading list-inconsistent-obj from stdin")
    obj = json.load(sys.stdin)
else:
    print("Fetching list-inconsistent-obj for PG {}".format(pgid))
    cmd = ["rados", "list-inconsistent-obj", pgid]
    cmd_pipe = Popen(cmd, stdout=PIPE, stderr=PIPE)
    cmdout, cmderr = cmd_pipe.communicate()
    if cmd_pipe.returncode != 0:
        print("Command: {}\nFailed: {}\nExit Code: {}".format(cmd, cmderr, cmd_pipe.returncode))
        # sys.exit() rather than the site-provided exit(), which is only
        # meant for interactive use and is absent under `python -S`.
        sys.exit(1)
    # json.loads() parses the captured output directly; no file-like
    # wrapper is needed.
    obj = json.loads(cmdout)

# One or more consecutive colons; used to split a string-form object info.
collonregex = re.compile(r"::*")
# Error names starting with "omap_" are treated as unsafe to repair.
unsaferegex = re.compile(r"omap_")

# NOTE(review): these two counters are not referenced anywhere in the visible
# part of this file; kept so any external user of the names keeps working.
hadincons = 0
printrepair = 0
def signal_handler(signal, frame):
    """Report the interrupt and terminate the script with exit status 1.

    Installed below as the SIGINT handler.  Both arguments are supplied by
    the signal machinery and are ignored.

    Args:
        signal: Number of the signal being handled.  NOTE: this parameter
            name shadows the ``signal`` module inside the handler; it is kept
            unchanged for interface stability and the module is not needed
            in this body.
        frame: Stack frame that was executing when the signal arrived.

    Raises:
        SystemExit: always, with code 1.
    """
    print('You pressed Ctrl+C! Exiting...')
    print('')
    # sys.exit() instead of the site-provided exit(), which is intended for
    # the interactive shell and is not guaranteed to exist.
    sys.exit(1)


signal.signal(signal.SIGINT, signal_handler)
def parse_shards(shards):
    """Choose the OSD whose copy of an inconsistent object should be used.

    Every shard is a dict with an ``'osd'`` id, usually an ``'errors'`` list
    of error names, and further attributes.  All keys except ``'osd'`` and
    ``'primary'`` are grouped by value, recording which OSDs reported each
    value.  Selection then proceeds in this order:

    1. OSDs that reported an error not shared by *all* shards are
       blacklisted.
    2. For any attribute where all shards but one agree on a value (and at
       least two agree), the first non-blacklisted OSD of the agreeing group
       is returned.
    3. Otherwise, if nothing is blacklisted, the first shard's OSD is
       returned.
    4. Otherwise the first non-blacklisted shard's OSD is returned.

    Args:
        shards: list of shard dicts as described above.

    Returns:
        The id of the selected OSD (the ``'osd'`` value of one shard).

    Raises:
        SystemExit: code 2 if any shard reports an error whose name starts
            with ``"omap_"``; code 1 if ``shards`` is empty or every shard is
            blacklisted.
    """
    shardcount = len(shards)
    if shardcount == 0:
        # Nothing to choose from; previously this ended in an IndexError.
        print("Failed to find good osd. Please manually review this PG.")
        sys.exit(1)

    # master[attribute][value] -> list of OSD ids that reported that value.
    master = {}
    for shard in shards:
        currentosd = shard['osd']
        for key, val in shard.items():
            if key in ("osd", "primary"):
                continue
            votes = master.setdefault(key, {})
            if key == "errors":
                for error in val:
                    # Same test as an anchored match of the "omap_" pattern.
                    if error.startswith("omap_"):
                        print("Unsafe to repair omap mismatch with this script. Exiting to preserve data.")
                        sys.exit(2)
                    votes.setdefault(error, []).append(currentosd)
            else:
                # Nested JSON values (objects/arrays) are unhashable and
                # cannot be dict keys; group them by a canonical dump.
                if isinstance(val, (dict, list)):
                    val = json.dumps(val, sort_keys=True)
                votes.setdefault(val, []).append(currentosd)

    # Build the blacklist *before* voting so the result does not depend on
    # the order in which the attribute keys happen to be iterated.
    blacklistosd = set()
    for osds in master.get("errors", {}).values():
        if len(osds) < shardcount:
            blacklistosd.update(osds)

    for key, value in master.items():
        if key == "errors":
            continue
        for param, osds in value.items():
            # Only an "all but one agree" group counts, and a single OSD
            # never forms a majority (two-shard case).
            if len(osds) != shardcount - 1 or len(osds) < 2:
                continue
            # Consider every member of the group; the previous index loop
            # stopped one short and never looked at the last OSD.
            candidates = [osd for osd in osds if osd not in blacklistosd]
            if not candidates:
                continue
            goodosd = candidates[0]
            print("Matched OSD data:")
            print("Key,Val,OSD")
            print("{},{},{}".format(key, param, goodosd))
            return goodosd

    if not blacklistosd:
        print("All OSDs errored, no difference among them, Using first OSD: osd.{}".format(shards[0]['osd']))
        return shards[0]['osd']

    # Scan all shards (the previous loop skipped the last one).
    for shard in shards:
        if shard['osd'] not in blacklistosd:
            print("Using non-errored OSD: osd.{}".format(shard['osd']))
            return shard['osd']

    print("Failed to find good osd. Please manually review this PG.")
    sys.exit(1)
def _run_command(cmd):
    """Run *cmd* (an argv list) and return its stdout, or None on failure.

    On a non-zero exit status the command, its stderr and the exit code are
    printed and None is returned; the caller decides whether to skip or exit.
    """
    cmd_pipe = Popen(cmd, stdout=PIPE, stderr=PIPE)
    cmdout, cmderr = cmd_pipe.communicate()
    if cmd_pipe.returncode != 0:
        print("Command: {}\nFailed: {}\nExit Code: {}".format(cmd, cmderr, cmd_pipe.returncode))
        return None
    return cmdout


def _shell_quote(text):
    """Quote *text* so a shell passes it through as one literal word."""
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2
    return quote(text)


def _local_copies(pattern):
    """Return the local paths matching the shell-style *pattern*."""
    import glob
    return glob.glob(pattern)


def _issue_pg_repair(pg):
    """Run `ceph pg repair <pg>`; exit with status 1 if the command fails."""
    if _run_command(["ceph", "pg", "repair", pg]) is None:
        sys.exit(1)


# Nothing listed as inconsistent: just ask for a repair and stop.
if not obj['inconsistents']:
    print("No inconsistent objects in PG {}. Issuing repair.".format(pgid))
    _issue_pg_repair(pgid)
    sys.exit(0)

# Map pool number -> pool name.
lspools_out = _run_command(["ceph", "osd", "lspools", "--format=json"])
if lspools_out is None:
    sys.exit(1)
pools = {pool['poolnum']: pool['poolname'] for pool in json.loads(lspools_out)}

needrepair = 0
for incons in obj['inconsistents']:
    for error in incons['errors']:
        if unsaferegex.match(error) is not None:
            print("Unsafe to repair omap mismatch with this script. Exiting to preserve data.")
            sys.exit(2)

    objname = incons['object']['name']
    snap = incons['object']['snap']

    # selected_object_info is either a structured dict or a colon-separated
    # string.  isinstance() replaces the old check against the Python 2-only
    # `unicode` name.
    info = incons['selected_object_info']
    if isinstance(info, dict):
        poolid = info['oid']['pool']
        fname = "{}__{}_*_{}".format(info['oid']['oid'], snap, poolid)
    else:
        fnamedata = collonregex.split(info)
        poolid = fnamedata[0]
        fname = "{}__{}_*_{}".format(fnamedata[2], snap, poolid)

    print("Object: {}".format(objname))
    print("Pool ID: {}".format(poolid))
    print("Filename: {}".format(fname))

    goodosd = parse_shards(incons['shards'])

    poolname = pools.get(int(poolid))
    if poolname is None:
        # Previously an unhandled KeyError.
        print("Pool id {} not found in 'ceph osd lspools' output. Skipping object {}.".format(poolid, objname))
        continue

    if debug:
        # Only print the equivalent shell commands.
        print("NODENAME=$(ceph osd find {} | awk '/\"host\"/ {{gsub(/[\",]/,\"\",$NF); print $NF}}')".format(goodosd))
        print("ssh ${{NODENAME}} 'find /var/lib/ceph/osd/ceph-{0:d}/current/{1:s}_head/ -name {2:s} -exec cp -v {{}} /tmp/ \\;'".format(goodosd, pgid, fname))
        print("scp ${{NODENAME}}:/tmp/{0:s} /tmp".format(fname))
        print("rados put -p {0:s} {1:s} /tmp/{2:s}\n".format(poolname, objname, fname))
        continue

    ## Identify OSD host to find/copy object from
    print("Finding osd {}".format(goodosd))
    find_out = _run_command(["ceph", "osd", "find", str(goodosd)])
    if find_out is None:
        continue
    osdhost = json.loads(find_out)["crush_location"]["host"]

    print("Finding object {} on OSD {} from host {}".format(objname, goodosd, osdhost))
    # ssh hands the command string to a remote shell, so the directory and
    # the -name pattern are quoted: `find` must receive the pattern itself
    # (including its '*'), and shell metacharacters in an object name must
    # not be interpreted.
    pg_dir = "/var/lib/ceph/osd/ceph-{}/current/{}_head/".format(goodosd, pgid)
    remote_find = "find {} -name {} -exec cp -v {{}} /tmp/ \\;".format(_shell_quote(pg_dir), _shell_quote(fname))
    if _run_command(["ssh", osdhost, remote_find]) is None:
        continue

    print("Copying {} from {}".format(fname, goodosd))
    # The wildcard in the remote path is left unquoted on purpose: it is
    # what selects the copied file on the remote side.
    if _run_command(["scp", "{}:/tmp/{}".format(osdhost, fname), "/tmp/"]) is None:
        continue

    # fname contains a '*' and no shell is involved when rados is started,
    # so the pattern has to be resolved to the real local file here.
    local_matches = _local_copies("/tmp/{}".format(fname))
    if len(local_matches) != 1:
        print("Expected exactly one local file matching /tmp/{} after copy, found {}. Skipping object {}.".format(fname, len(local_matches), objname))
        continue

    print("Performing rados put of object {} from file {} into pool {}".format(objname, fname, poolname))
    if _run_command(["rados", "-p", poolname, "put", objname, local_matches[0]]) is None:
        continue

    needrepair = 1
    print("PG {} object {} found, copied, put".format(pgid, objname))

# Only request a repair if at least one object was actually re-written.
if needrepair == 1:
    print("Issuing repair for pg {} ".format(pgid))
    _issue_pg_repair(pgid)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment