rmetzger commented on a change in pull request #15883:
URL: https://github.com/apache/flink/pull/15883#discussion_r629459041



##########
File path: 
tools/ci/java-ci-tools/src/main/java/org/apache/flink/tools/ci/licensecheck/JarFileChecker.java
##########
@@ -134,6 +140,91 @@ private static boolean licenseFileExistsAndIsValid(Path 
licenseFile, Path jar)
         return true;
     }
 
+    private static int getFilesWithIncompatibleLicenses(Path jar, Path 
jarRoot) throws IOException {
+        return findNonBinaryFilesContainingText(
+                jar,
+                jarRoot,
+                asPatterns(
+                        "GNU Lesser General Public License",
+                        "GNU General Public License",
+                        "GPL", // also detects LGPL
+                        "GNU Affero General Public License",
+                        "Amazon Software License",
+                        "Confluent Community License Agreement Version 1.0",
+                        "Don’t be evil") // can sometimes be found in "funny" 
licenses
+                );
+    }
+
+    private static Collection<Pattern> asPatterns(String... texts) {
+        return Stream.of(texts)
+                .map(JarFileChecker::asPatternWithPotentialLineBreaks)
+                .collect(Collectors.toList());
+    }
+
+    private static Pattern asPatternWithPotentialLineBreaks(String text) {
+        return Pattern.compile(text.toLowerCase(Locale.ROOT).replaceAll(" ", " 
?\\\\R?[\\\\s/#]*"));
+    }
+
+    private static int findNonBinaryFilesContainingText(
+            Path jar, Path jarRoot, Collection<Pattern> forbidden) throws 
IOException {
+        try (Stream<Path> files = Files.walk(jarRoot)) {
+            return files.filter(path -> !path.equals(jarRoot))
+                    .filter(path -> !Files.isDirectory(path))
+                    .filter(JarFileChecker::isNoClassFile)
+                    // frequent false-positives due to dual-licensing; 
generated by maven
+                    .filter(path -> !getFileName(path).equals("dependencies"))
+                    // false-positives due to dual-licensing; use startsWith 
to cover .txt/.md files
+                    .filter(path -> !getFileName(path).startsWith("license"))
+                    // false-positives due to optional components; startsWith 
covers .txt/.md files
+                    .filter(path -> !getFileName(path).startsWith("notice"))
+                    // dual-licensed under GPL 2 and CDDL 1.1
+                    // contained in hadoop/presto S3 FS and flink-dist
+                    .filter(path -> !pathStartsWith(path, 
"/META-INF/versions/11/javax/xml/bind"))
+                    .filter(path -> !(isJavaxManifest(jar, path)))
+                    // dual-licensed under GPL 2 and EPL 2.0
+                    // contained in sql-avro-confluent-registry
+                    .filter(path -> !pathStartsWith(path, 
"/org/glassfish/jersey/internal"))
+                    .map(
+                            path -> {
+                                try {
+                                    final String fileContents;
+                                    try {
+                                        fileContents =
+                                                Files.readString(path, 
StandardCharsets.UTF_8)
+                                                        
.toLowerCase(Locale.ROOT);

Review comment:
       Seems like you are living in the future: `Files.readString` is only available since Java 11.

##########
File path: 
tools/ci/java-ci-tools/src/main/java/org/apache/flink/tools/ci/licensecheck/JarFileChecker.java
##########
@@ -188,4 +276,24 @@ private static int 
getNumLicenseFilesOutsideMetaInfDirectory(Path jar, Path jarR
     private static String getFileName(Path path) {
         return path.getFileName().toString().toLowerCase();
     }
+
+    private static boolean pathStartsWith(Path file, String path) {
+        return file.startsWith(file.getFileSystem().getPath(path));
+    }
+
+    private static boolean equals(Path file, String path) {
+        return file.equals(file.getFileSystem().getPath(path));
+    }
+
+    private static boolean isNoClassFile(Path file) {
+        return !getFileName(file).endsWith(".class");
+    }
+
+    private static boolean isJavaxManifest(Path jar, Path 
potentialManifestFile) {
+        final String jarFileName = getFileName(jar);
+
+        return (jarFileName.startsWith("flink-s3-fs-hadoop")
+                        || jarFileName.startsWith("flink-s3-fs-presto"))

Review comment:
       What do the S3 modules have to do with the javax manifest check?

##########
File path: 
tools/ci/java-ci-tools/src/main/java/org/apache/flink/tools/ci/licensecheck/JarFileChecker.java
##########
@@ -134,6 +140,91 @@ private static boolean licenseFileExistsAndIsValid(Path 
licenseFile, Path jar)
         return true;
     }
 
+    private static int getFilesWithIncompatibleLicenses(Path jar, Path 
jarRoot) throws IOException {
+        return findNonBinaryFilesContainingText(
+                jar,
+                jarRoot,
+                asPatterns(
+                        "GNU Lesser General Public License",
+                        "GNU General Public License",
+                        "GPL", // also detects LGPL
+                        "GNU Affero General Public License",
+                        "Amazon Software License",
+                        "Confluent Community License Agreement Version 1.0",
+                        "Don’t be evil") // can sometimes be found in "funny" 
licenses
+                );
+    }
+
+    private static Collection<Pattern> asPatterns(String... texts) {
+        return Stream.of(texts)
+                .map(JarFileChecker::asPatternWithPotentialLineBreaks)
+                .collect(Collectors.toList());
+    }
+
+    private static Pattern asPatternWithPotentialLineBreaks(String text) {
+        return Pattern.compile(text.toLowerCase(Locale.ROOT).replaceAll(" ", " 
?\\\\R?[\\\\s/#]*"));
+    }
+
+    private static int findNonBinaryFilesContainingText(
+            Path jar, Path jarRoot, Collection<Pattern> forbidden) throws 
IOException {
+        try (Stream<Path> files = Files.walk(jarRoot)) {
+            return files.filter(path -> !path.equals(jarRoot))
+                    .filter(path -> !Files.isDirectory(path))
+                    .filter(JarFileChecker::isNoClassFile)
+                    // frequent false-positives due to dual-licensing; 
generated by maven
+                    .filter(path -> !getFileName(path).equals("dependencies"))
+                    // false-positives due to dual-licensing; use startsWith 
to cover .txt/.md files
+                    .filter(path -> !getFileName(path).startsWith("license"))
+                    // false-positives due to optional components; startsWith 
covers .txt/.md files
+                    .filter(path -> !getFileName(path).startsWith("notice"))
+                    // dual-licensed under GPL 2 and CDDL 1.1
+                    // contained in hadoop/presto S3 FS and flink-dist
+                    .filter(path -> !pathStartsWith(path, 
"/META-INF/versions/11/javax/xml/bind"))
+                    .filter(path -> !(isJavaxManifest(jar, path)))

Review comment:
       ```suggestion
                       .filter(path -> !isJavaxManifest(jar, path))
   ```

##########
File path: 
tools/ci/java-ci-tools/src/main/java/org/apache/flink/tools/ci/licensecheck/JarFileChecker.java
##########
@@ -134,6 +140,91 @@ private static boolean licenseFileExistsAndIsValid(Path 
licenseFile, Path jar)
         return true;
     }
 
+    private static int getFilesWithIncompatibleLicenses(Path jar, Path 
jarRoot) throws IOException {
+        return findNonBinaryFilesContainingText(
+                jar,
+                jarRoot,
+                asPatterns(
+                        "GNU Lesser General Public License",
+                        "GNU General Public License",
+                        "GPL", // also detects LGPL
+                        "GNU Affero General Public License",
+                        "Amazon Software License",
+                        "Confluent Community License Agreement Version 1.0",
+                        "Don’t be evil") // can sometimes be found in "funny" 
licenses

Review comment:
       I wonder whether we should put all licenses listed here 
https://www.apache.org/legal/resolved.html#category-x as patterns?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to