Skip to content

Commit c0979fd

Browse files
authored
Checks for problems in Accumulo (#4957)
* Checks for problems in Accumulo

  This partially completes #4892:
  - Moves existing checks (`checkTablets` and the fate check for dangling locks) into the appropriate new `admin check` command
  - Adds new checks
  - New tests in AdminCheckIT
  - Created new check TABLE_LOCKS which checks for
    - valid locked table/namespace ids (the locked table/namespaces exist)
    - locked table/namespaces are associated with a fate op
  - ROOT_METADATA now checks for
    - offline tablets
    - missing "columns"
    - invalid "columns"
  - ROOT_TABLE now checks for
    - offline tablets
    - tablets for metadata table have no holes, valid (null) prev end row for first tablet, and valid (null) end row for last tablet
    - missing columns
    - invalid columns
  - METADATA_TABLE now checks for
    - offline tablets
    - tablets for user tables (and scanref) have no holes, valid (null) prev end row for first tablet, and valid (null) end row for last tablet
    - missing columns
    - invalid columns
  - SYSTEM_FILES now checks for
    - missing system files
  - USER_FILES now checks for
    - missing user files
1 parent 96c4582 commit c0979fd

18 files changed

+941
-188
lines changed

core/src/main/java/org/apache/accumulo/core/fate/AdminUtil.java

-12
Original file line numberDiff line numberDiff line change
@@ -424,18 +424,6 @@ public void print(ReadOnlyTStore<T> zs, ZooReader zk, ServiceLock.ServiceLockPat
424424
txStatus.getWaitingLocks(), txStatus.getTop(), txStatus.getTimeCreatedFormatted());
425425
}
426426
fmt.format(" %s transactions", fateStatus.getTransactions().size());
427-
428-
if (!fateStatus.getDanglingHeldLocks().isEmpty()
429-
|| !fateStatus.getDanglingWaitingLocks().isEmpty()) {
430-
fmt.format("%nThe following locks did not have an associated FATE operation%n");
431-
for (Entry<String,List<String>> entry : fateStatus.getDanglingHeldLocks().entrySet()) {
432-
fmt.format("txid: %s locked: %s%n", entry.getKey(), entry.getValue());
433-
}
434-
435-
for (Entry<String,List<String>> entry : fateStatus.getDanglingWaitingLocks().entrySet()) {
436-
fmt.format("txid: %s locking: %s%n", entry.getKey(), entry.getValue());
437-
}
438-
}
439427
}
440428

441429
public boolean prepDelete(TStore<T> zs, ZooReaderWriter zk, ServiceLockPath path,

core/src/main/java/org/apache/accumulo/core/iterators/user/WholeRowIterator.java

+7-3
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,9 @@ private static byte[] readField(DataInputStream din) throws IOException {
8686
return b;
8787
}
8888

89-
// decode a bunch of key value pairs that have been encoded into a single value
89+
/**
90+
* decode a bunch of key value pairs that have been encoded into a single value
91+
*/
9092
public static final SortedMap<Key,Value> decodeRow(Key rowKey, Value rowValue)
9193
throws IOException {
9294
SortedMap<Key,Value> map = new TreeMap<>();
@@ -110,8 +112,10 @@ private static void encode(DataOutputStream dout, ByteSequence bs) throws IOExce
110112
dout.write(bs.getBackingArray(), bs.offset(), bs.length());
111113
}
112114

113-
// take a stream of keys and values and output a value that encodes everything but their row
114-
// keys and values must be paired one for one
115+
/**
116+
* take a stream of keys and values and output a value that encodes everything but their row.
117+
* Keys and values must be paired one for one.
118+
*/
115119
public static final Value encodeRow(List<Key> keys, List<Value> values) throws IOException {
116120
ByteArrayOutputStream out = new ByteArrayOutputStream();
117121
DataOutputStream dout = new DataOutputStream(out);

server/base/src/main/java/org/apache/accumulo/server/util/Admin.java

+21-43
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@
8686
import org.apache.accumulo.server.util.checkCommand.RootTableCheckRunner;
8787
import org.apache.accumulo.server.util.checkCommand.SystemConfigCheckRunner;
8888
import org.apache.accumulo.server.util.checkCommand.SystemFilesCheckRunner;
89+
import org.apache.accumulo.server.util.checkCommand.TableLocksCheckRunner;
8990
import org.apache.accumulo.server.util.checkCommand.UserFilesCheckRunner;
9091
import org.apache.accumulo.server.util.fateCommand.FateSummaryReport;
9192
import org.apache.accumulo.start.spi.KeywordExecutable;
@@ -145,6 +146,10 @@ public static class CheckCommand {
145146
@Parameter(description = "[<Check>...]")
146147
List<String> checks;
147148

149+
@Parameter(names = "--fixFiles", description = "Removes dangling file pointers. Used by the "
150+
+ "USER_FILES and SYSTEM_FILES checks.")
151+
boolean fixFiles = false;
152+
148153
/**
149154
* This should be used to get the check runner instead of {@link Check#getCheckRunner()}. This
150155
* exists so that its functionality can be changed for testing.
@@ -159,6 +164,9 @@ public enum Check {
159164
// Caution should be taken when changing or adding any new checks: order is important
160165
SYSTEM_CONFIG(SystemConfigCheckRunner::new, "Validate the system config stored in ZooKeeper",
161166
Collections.emptyList()),
167+
TABLE_LOCKS(TableLocksCheckRunner::new,
168+
"Ensures that table and namespace locks are valid and are associated with a FATE op",
169+
Collections.singletonList(SYSTEM_CONFIG)),
162170
ROOT_METADATA(RootMetadataCheckRunner::new,
163171
"Checks integrity of the root tablet metadata stored in ZooKeeper",
164172
Collections.singletonList(SYSTEM_CONFIG)),
@@ -214,16 +222,6 @@ public enum CheckStatus {
214222
}
215223
}
216224

217-
@Parameters(commandDescription = "print tablets that are offline in online tables")
218-
static class CheckTabletsCommand {
219-
@Parameter(names = "--fixFiles", description = "Remove dangling file pointers")
220-
boolean fixFiles = false;
221-
222-
@Parameter(names = {"-t", "--table"},
223-
description = "Table to check, if not set checks all tables")
224-
String tableName = null;
225-
}
226-
227225
@Parameters(commandDescription = "stop the manager")
228226
static class StopManagerCommand {}
229227

@@ -377,9 +375,6 @@ public void execute(final String[] args) {
377375
CheckCommand checkCommand = new CheckCommand();
378376
cl.addCommand("check", checkCommand);
379377

380-
CheckTabletsCommand checkTabletsCommand = new CheckTabletsCommand();
381-
cl.addCommand("checkTablets", checkTabletsCommand);
382-
383378
DeleteZooInstanceCommand deleteZooInstOpts = new DeleteZooInstanceCommand();
384379
cl.addCommand("deleteZooInstance", deleteZooInstOpts);
385380

@@ -441,24 +436,6 @@ public void execute(final String[] args) {
441436
if (ping(context, pingCommand.args) != 0) {
442437
rc = 4;
443438
}
444-
} else if (cl.getParsedCommand().equals("checkTablets")) {
445-
System.out.println("\n*** Looking for offline tablets ***\n");
446-
if (FindOfflineTablets.findOffline(context, checkTabletsCommand.tableName) != 0) {
447-
rc = 5;
448-
}
449-
System.out.println("\n*** Looking for missing files ***\n");
450-
if (checkTabletsCommand.tableName == null) {
451-
if (RemoveEntriesForMissingFiles.checkAllTables(context, checkTabletsCommand.fixFiles)
452-
!= 0) {
453-
rc = 6;
454-
}
455-
} else {
456-
if (RemoveEntriesForMissingFiles.checkTable(context, checkTabletsCommand.tableName,
457-
checkTabletsCommand.fixFiles) != 0) {
458-
rc = 6;
459-
}
460-
}
461-
462439
} else if (cl.getParsedCommand().equals("stop")) {
463440
stopTabletServer(context, stopOpts.args, opts.force);
464441
} else if (cl.getParsedCommand().equals("dumpConfig")) {
@@ -482,7 +459,7 @@ public void execute(final String[] args) {
482459
} else if (cl.getParsedCommand().equals("serviceStatus")) {
483460
printServiceStatus(context, serviceStatusCommandOpts);
484461
} else if (cl.getParsedCommand().equals("check")) {
485-
executeCheckCommand(context, checkCommand);
462+
executeCheckCommand(context, checkCommand, opts);
486463
} else {
487464
everything = cl.getParsedCommand().equals("stopAll");
488465

@@ -1012,15 +989,16 @@ private EnumSet<ReadOnlyTStore.TStatus> getCmdLineStatusFilters(List<String> sta
1012989
}
1013990

1014991
@VisibleForTesting
1015-
public static void executeCheckCommand(ServerContext context, CheckCommand cmd) {
992+
public static void executeCheckCommand(ServerContext context, CheckCommand cmd,
993+
ServerUtilOpts opts) throws Exception {
1016994
validateAndTransformCheckCommand(cmd);
1017995

1018996
if (cmd.list) {
1019997
listChecks();
1020998
} else if (cmd.run) {
1021-
var givenChecks =
1022-
cmd.checks.stream().map(CheckCommand.Check::valueOf).collect(Collectors.toList());
1023-
executeRunCheckCommand(cmd, givenChecks);
999+
var givenChecks = cmd.checks.stream()
1000+
.map(name -> CheckCommand.Check.valueOf(name.toUpperCase())).collect(Collectors.toList());
1001+
executeRunCheckCommand(cmd, givenChecks, context, opts);
10241002
}
10251003
}
10261004

@@ -1051,19 +1029,19 @@ private static void validateAndTransformCheckCommand(CheckCommand cmd) {
10511029

10521030
private static void listChecks() {
10531031
System.out.println();
1054-
System.out.printf("%-20s | %-80s | %-20s%n", "Check Name", "Description", "Depends on");
1055-
System.out.println("-".repeat(120));
1032+
System.out.printf("%-20s | %-90s | %-20s%n", "Check Name", "Description", "Depends on");
1033+
System.out.println("-".repeat(130));
10561034
for (CheckCommand.Check check : CheckCommand.Check.values()) {
1057-
System.out.printf("%-20s | %-80s | %-20s%n", check.name(), check.getDescription(),
1035+
System.out.printf("%-20s | %-90s | %-20s%n", check.name(), check.getDescription(),
10581036
check.getDependencies().stream().map(CheckCommand.Check::name)
10591037
.collect(Collectors.joining(", ")));
10601038
}
1061-
System.out.println("-".repeat(120));
1039+
System.out.println("-".repeat(130));
10621040
System.out.println();
10631041
}
10641042

1065-
private static void executeRunCheckCommand(CheckCommand cmd,
1066-
List<CheckCommand.Check> givenChecks) {
1043+
private static void executeRunCheckCommand(CheckCommand cmd, List<CheckCommand.Check> givenChecks,
1044+
ServerContext context, ServerUtilOpts opts) throws Exception {
10671045
// Get all the checks in the order they are declared in the enum
10681046
final var allChecks = CheckCommand.Check.values();
10691047
final TreeMap<CheckCommand.Check,CheckCommand.CheckStatus> checkStatus = new TreeMap<>();
@@ -1073,7 +1051,7 @@ private static void executeRunCheckCommand(CheckCommand cmd,
10731051
checkStatus.put(check, CheckCommand.CheckStatus.SKIPPED_DEPENDENCY_FAILED);
10741052
} else {
10751053
if (givenChecks.contains(check)) {
1076-
checkStatus.put(check, cmd.getCheckRunner(check).runCheck());
1054+
checkStatus.put(check, cmd.getCheckRunner(check).runCheck(context, opts, cmd.fixFiles));
10771055
} else {
10781056
checkStatus.put(check, CheckCommand.CheckStatus.FILTERED_OUT);
10791057
}

server/base/src/main/java/org/apache/accumulo/server/util/CheckForMetadataProblems.java

+57-37
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.util.Map;
2424
import java.util.Map.Entry;
2525
import java.util.TreeSet;
26+
import java.util.function.Consumer;
2627

2728
import org.apache.accumulo.core.client.Accumulo;
2829
import org.apache.accumulo.core.client.AccumuloClient;
@@ -45,75 +46,83 @@
4546
import io.opentelemetry.context.Scope;
4647

4748
public class CheckForMetadataProblems {
48-
private static boolean sawProblems = false;
49-
private static ServerUtilOpts opts;
5049

51-
private static void checkTable(TableId tableId, TreeSet<KeyExtent> tablets) {
50+
private static boolean checkTable(TableId tableId, TreeSet<KeyExtent> tablets,
51+
ServerUtilOpts opts, Consumer<String> printInfoMethod, Consumer<String> printProblemMethod) {
5252
// sanity check of metadata table entries
53-
// make sure tablets has no holes, and that it starts and ends w/ null
53+
// make sure tablets have no holes, and that it starts and ends w/ null
5454
String tableName;
55+
boolean sawProblems = false;
5556

5657
try {
5758
tableName = opts.getServerContext().getTableName(tableId);
5859
} catch (TableNotFoundException e) {
5960
tableName = null;
6061
}
6162

63+
printInfoMethod.accept(String.format("Ensuring tablets for table %s (%s) have: no holes, "
64+
+ "valid (null) prev end row for first tablet, and valid (null) end row "
65+
+ "for last tablet...\n", tableName, tableId));
66+
6267
if (tablets.isEmpty()) {
63-
System.out.println(
64-
"...No entries found in metadata table for table " + tableName + " (" + tableId + ")");
65-
sawProblems = true;
66-
return;
68+
printProblemMethod.accept(String
69+
.format("...No entries found in metadata table for table %s (%s)", tableName, tableId));
70+
return true;
6771
}
6872

6973
if (tablets.first().prevEndRow() != null) {
70-
System.out.println("...First entry for table " + tableName + " (" + tableId + ") - "
71-
+ tablets.first() + " - has non null prev end row");
72-
sawProblems = true;
73-
return;
74+
printProblemMethod
75+
.accept(String.format("...First entry for table %s (%s) - %s - has non-null prev end row",
76+
tableName, tableId, tablets.first()));
77+
return true;
7478
}
7579

7680
if (tablets.last().endRow() != null) {
77-
System.out.println("...Last entry for table " + tableName + " (" + tableId + ") - "
78-
+ tablets.last() + " - has non null end row");
79-
sawProblems = true;
80-
return;
81+
printProblemMethod
82+
.accept(String.format("...Last entry for table %s (%s) - %s - has non-null end row",
83+
tableName, tableId, tablets.last()));
84+
return true;
8185
}
8286

8387
Iterator<KeyExtent> tabIter = tablets.iterator();
8488
Text lastEndRow = tabIter.next().endRow();
8589
boolean everythingLooksGood = true;
8690
while (tabIter.hasNext()) {
87-
KeyExtent tabke = tabIter.next();
91+
KeyExtent table = tabIter.next();
8892
boolean broke = false;
89-
if (tabke.prevEndRow() == null) {
90-
System.out.println("...Table " + tableName + " (" + tableId
91-
+ ") has null prev end row in middle of table " + tabke);
93+
if (table.prevEndRow() == null) {
94+
printProblemMethod
95+
.accept(String.format("...Table %s (%s) has null prev end row in middle of table %s",
96+
tableName, tableId, table));
9297
broke = true;
93-
} else if (!tabke.prevEndRow().equals(lastEndRow)) {
94-
System.out.println("...Table " + tableName + " (" + tableId + ") has a hole "
95-
+ tabke.prevEndRow() + " != " + lastEndRow);
98+
} else if (!table.prevEndRow().equals(lastEndRow)) {
99+
printProblemMethod.accept(String.format("...Table %s (%s) has a hole %s != %s", tableName,
100+
tableId, table.prevEndRow(), lastEndRow));
96101
broke = true;
97102
}
98103
if (broke) {
99104
everythingLooksGood = false;
100105
}
101106

102-
lastEndRow = tabke.endRow();
107+
lastEndRow = table.endRow();
103108
}
104109
if (everythingLooksGood) {
105-
System.out.println("...All is well for table " + tableName + " (" + tableId + ")");
110+
printInfoMethod.accept(String.format("...All is well for table %s (%s)", tableName, tableId));
106111
} else {
107112
sawProblems = true;
108113
}
114+
115+
return sawProblems;
109116
}
110117

111-
private static void checkMetadataAndRootTableEntries(String tableNameToCheck, ServerUtilOpts opts)
118+
public static boolean checkMetadataAndRootTableEntries(String tableNameToCheck,
119+
ServerUtilOpts opts, Consumer<String> printInfoMethod, Consumer<String> printProblemMethod)
112120
throws Exception {
113121
TableId tableCheckId = opts.getServerContext().getTableId(tableNameToCheck);
114-
System.out.println("Checking tables whose metadata is found in: " + tableNameToCheck + " ("
115-
+ tableCheckId + ")");
122+
printInfoMethod.accept(String.format("Checking tables whose metadata is found in: %s (%s)...\n",
123+
tableNameToCheck, tableCheckId));
116124
Map<TableId,TreeSet<KeyExtent>> tables = new HashMap<>();
125+
boolean sawProblems = false;
117126

118127
try (AccumuloClient client = Accumulo.newClient().from(opts.getClientProps()).build();
119128
Scanner scanner = client.createScanner(tableNameToCheck, Authorizations.EMPTY)) {
@@ -139,7 +148,10 @@ private static void checkMetadataAndRootTableEntries(String tableNameToCheck, Se
139148
TreeSet<KeyExtent> tablets = tables.get(tableId);
140149
if (tablets == null) {
141150

142-
tables.forEach(CheckForMetadataProblems::checkTable);
151+
for (var e : tables.entrySet()) {
152+
sawProblems = CheckForMetadataProblems.checkTable(e.getKey(), e.getValue(), opts,
153+
printInfoMethod, printProblemMethod) || sawProblems;
154+
}
143155

144156
tables.clear();
145157

@@ -153,37 +165,45 @@ private static void checkMetadataAndRootTableEntries(String tableNameToCheck, Se
153165
justLoc = false;
154166
} else if (colf.equals(CurrentLocationColumnFamily.NAME)) {
155167
if (justLoc) {
156-
System.out.println("Problem at key " + entry.getKey());
168+
printProblemMethod.accept("Problem at key " + entry.getKey());
157169
sawProblems = true;
158170
}
159171
justLoc = true;
160172
}
161173
}
162174

163175
if (count == 0) {
164-
System.err
165-
.println("ERROR : table " + tableNameToCheck + " (" + tableCheckId + ") is empty");
176+
printProblemMethod.accept(
177+
String.format("ERROR : table %s (%s) is empty", tableNameToCheck, tableCheckId));
166178
sawProblems = true;
167179
}
168180
}
169181

170-
tables.forEach(CheckForMetadataProblems::checkTable);
182+
for (var e : tables.entrySet()) {
183+
sawProblems = CheckForMetadataProblems.checkTable(e.getKey(), e.getValue(), opts,
184+
printInfoMethod, printProblemMethod) || sawProblems;
185+
}
171186

172187
if (!sawProblems) {
173-
System.out.println("No problems found in " + tableNameToCheck + " (" + tableCheckId + ")");
188+
printInfoMethod.accept(
189+
String.format("\n...No problems found in %s (%s)", tableNameToCheck, tableCheckId));
174190
}
175191
// end METADATA table sanity check
192+
return sawProblems;
176193
}
177194

178195
public static void main(String[] args) throws Exception {
179-
opts = new ServerUtilOpts();
196+
ServerUtilOpts opts = new ServerUtilOpts();
180197
opts.parseArgs(CheckForMetadataProblems.class.getName(), args);
181198
Span span = TraceUtil.startSpan(CheckForMetadataProblems.class, "main");
199+
boolean sawProblems;
182200
try (Scope scope = span.makeCurrent()) {
183201

184-
checkMetadataAndRootTableEntries(AccumuloTable.ROOT.tableName(), opts);
202+
sawProblems = checkMetadataAndRootTableEntries(AccumuloTable.ROOT.tableName(), opts,
203+
System.out::println, System.out::println);
185204
System.out.println();
186-
checkMetadataAndRootTableEntries(AccumuloTable.METADATA.tableName(), opts);
205+
sawProblems = checkMetadataAndRootTableEntries(AccumuloTable.METADATA.tableName(), opts,
206+
System.out::println, System.out::println) || sawProblems;
187207
if (sawProblems) {
188208
throw new IllegalStateException();
189209
}

0 commit comments

Comments
 (0)