CVSROOT: /web/www
Module name: www
Changes by: Waclaw Jacek <wwj> 11/09/18 21:36:18
Modified files: server/source/linc: linc.py Log message: Updating source code of LINC CVSWeb URLs: http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.1&r2=1.2 Patches: Index: linc.py =================================================================== RCS file: /web/www/www/server/source/linc/linc.py,v retrieving revision 1.1 retrieving revision 1.2 diff -u -b -r1.1 -r1.2 --- linc.py 14 Mar 2011 18:44:23 -0000 1.1 +++ linc.py 18 Sep 2011 21:34:56 -0000 1.2 @@ -15,28 +15,32 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. -# -# --------------------------------------------------------------------- -# -# Command format: "python checklinks.py <local_directory_containing_website> <corresponding_address_on_web_server>" # defines +BASE_DIRECTORY = '/home/w/wwj/www-repo/' +REMOTE_BASE_DIRECTORY = 'http://www.gnu.org/' + +ADDITIONAL_HTTP_HEADERS = 'User-Agent: LINC/alpha\r\nAccept: text/html, text/plain, audio/mod, image/*, application/msword, application/pdf, application/postscript, text/sgml, video/mpeg, */*;q=0.01\r\n' # end every header with "\r\n" DELAY_BETWEEN_CHECKS = 1 # In seconds. Set to 0 to disable delay between checks of different links. DELAY_BETWEEN_RETRIES = 10 # In seconds. Used when a link fails before re-checking it. Set to 0 to disable delays. -NUMBER_OF_ATTEMPTS = 3 # Set to 1 for the program to check links just once, without retrying in case of failure. 0 makes the program not even check the link at all, so it is not the most recommended of values. +FORWARDS_TO_FOLLOW = 5 # How many forwards should be followed. +NUMBER_OF_ATTEMPTS = 3 # Number of times to check a link for error. If an attempt is successful, the link is no longer checked during that program run. REPORT_FILE_NAME = 'reports-temp/broken_links' # Path to the file to which the errors will be reported. 
SEPARATE_TRANSLATION_REPORTS = True # If you set this to True, reports for translations will be saved into "REPORT_FILE_NAME.lang" instead of in the main report file. SKIP_TRANSLATION_FILES = False # If you set this to True, files with translations will be skipped. SOCKET_TIMEOUT = 20 # After what time to give up with trying to retrieve a website. -EXCLUDED_DIRECTORIES_REGEXP = '(wwwes$|wwwin$|education/fr|software/[^/]+$)' # Matching directories will not be entered to check their files or subdirectories. -EXCLUDED_FILENAMES_REGEXP = '\.po$' # Matching files will be ignored. +# regexp-related defines + +EXCLUDED_DIRECTORIES_REGEXP = '^(japan|wwwes|wwwin|education/fr|software/[^/]+)$' # Matching directories will not be entered to check their files or subdirectories. +FILENAMES_TO_CHECK_REGEXP = '\.html$' # Only matching files will be checked. FTP_LINK_REGEXP = 'ftp://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?' HTTP_ERROR_HEADER = '^HTTP/1\.1 (?P<http_error_code>403|404) ' # What to treat as a HTTP error header. +HTTP_FORWARD_HEADER = '^HTTP/1\.1 (301 Moved Permanently|302 Found)$' HTTP_LINK_REGEXP = 'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?' -LINK_REGEXP = '<a( .+?)? href="([^mailto:].+?)"( .+?)?>' -LINK_REGEXP_GROUP = 1 # Number of the group that matters in the above regexp. +HTTP_NEW_LOCATION_HEADER = '^Location: (?P<new_location>.+)$' +LINK_REGEXP = '<a( .+?)? 
href="(?P<link>[^mailto:].+?)"( .+?)?>' TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$' # libraries @@ -76,7 +80,10 @@ socketfd.close() return None -def get_http_link_error( link ): +def get_http_link_error( link, forwarded_from = None ): # forwarded_from is either None or a list + if forwarded_from == None: + forwarded_from = [] + connection_data = re.search( HTTP_LINK_REGEXP, link ) if connection_data: hostname = connection_data.group( 'hostname' ) @@ -97,7 +104,7 @@ if resource == None: resource = '/' - socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' + hostname + '\r\n\r\n' ) + socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' + hostname + '\r\n' + ADDITIONAL_HTTP_HEADERS + '\r\n' ) webpage = socket_read( socketfd ) if webpage == None: @@ -112,11 +119,44 @@ header_lines = webpage[ : end_of_headers_pos ] header_lines = header_lines.split( '\r\n' ) - for header_line in header_lines: - match = re.search( HTTP_ERROR_HEADER, header_line ) + + # search for errors + match = regexp_search_list( HTTP_ERROR_HEADER, header_lines ) if match: http_error_code = match.group( 'http_error_code' ) return 'http error ' + http_error_code + ' returned by server' + + # look for forwards + match = regexp_search_list( HTTP_FORWARD_HEADER, header_lines ) + if match: + if len( forwarded_from ) < FORWARDS_TO_FOLLOW: # if we haven't been forwarded too many times yet... + match = regexp_search_list( HTTP_NEW_LOCATION_HEADER, header_lines ) + if match: + forwarded_from.append( link ) + new_location = match.group( 'new_location' ) + if new_location in forwarded_from: + return 'forward loop!' + else: + return get_http_link_error( new_location, forwarded_from ) + else: # we've been forwarded too many times, sorry. 
+ return 'too many forwards (over ' + str( len( forwarded_from ) ) + ')' + + return None + +def is_match_inside_comment_block( regexp_match ): + haystack = regexp_match.string + match_pos = regexp_match.start() + comment_block_start = haystack.rfind( '<!--', 0, match_pos ) + comment_block_end = haystack.rfind( '-->', 0, match_pos ) + if comment_block_start > comment_block_end: + return True + return False + +def regexp_search_list( regexp, the_list ): + for list_element in the_list: + match = re.search( regexp, list_element ) + if match: + return match return None def search_directory_for_files( base_directory, directory ): @@ -129,7 +169,7 @@ search_directory_for_files( base_directory, relative_path_to_element ) else: - if re.search( EXCLUDED_FILENAMES_REGEXP, element_name ): + if not re.search( FILENAMES_TO_CHECK_REGEXP, element_name ): continue if ( SKIP_TRANSLATION_FILES == True ) and re.search( TRANSLATION_REGEXP, element_name ): @@ -162,13 +202,8 @@ ### OK, main program below. -# check if a path has been provided -if len( sys.argv ) < 3: - print 'Please run the program with the following arguments: the directory which should be checked and its corresponding URL on the live site (so eg.: ./linc.py gnucvs/software/ http://www.gnu.org/software/)' - sys.exit( 1 ) - -base_directory = sys.argv[1] -remote_base_directory = sys.argv[2] +base_directory = BASE_DIRECTORY +remote_base_directory = REMOTE_BASE_DIRECTORY if remote_base_directory[-1] != '/': remote_base_directory += '/' @@ -176,7 +211,7 @@ # `cd` to this path if not os.path.isdir( base_directory ): - print 'Selected path isn\'t a directory (or doesn\'t exist at all).' + print 'Entered base directory isn\'t a directory (or doesn\'t exist at all).' 
sys.exit( 1 ) # list files @@ -192,8 +227,8 @@ file_contents = fd.read() fd.close() - for match in re.findall( LINK_REGEXP, file_contents ): - link = match[LINK_REGEXP_GROUP] + for match in re.finditer( LINK_REGEXP, file_contents ): + link = match.group( 'link' ) line_number = -1 split_file_contents = file_contents.split( '\n' ) @@ -203,7 +238,7 @@ line_number = checked_line_number break - link_container = { 'filename': file_to_check, 'line_number': line_number, 'link': link } + link_container = { 'filename': file_to_check, 'line_number': line_number, 'link': link, 'is_link_inside_comment_block': is_match_inside_comment_block( match ) } links_to_check.append( link_container ) # check links @@ -223,6 +258,7 @@ filename = link_container['filename'] line_number = link_container['line_number'] link = link_container['link'] + is_link_inside_comment_block = link_container['is_link_inside_comment_block'] link_type = None @@ -278,6 +314,9 @@ already_checked_links.append( { 'link': link, 'error': link_error } ) if link_error != None: + if is_link_inside_comment_block: + link_error += ' (link commented out)' + if not SKIP_TRANSLATION_FILES: match = re.search( TRANSLATION_REGEXP, filename ) if match and SEPARATE_TRANSLATION_REPORTS: