linc linc.py

Waclaw Jacek Sun, 18 Sep 2011 14:36:27 -0700

CVSROOT:        /web/www
Module name:    www
Changes by:     Waclaw Jacek <wwj>      11/09/18 21:36:18


Modified files:
        server/source/linc: linc.py 

Log message:
        Updating source code of LINC

CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.1&r2=1.2

Patches:
Index: linc.py
===================================================================
RCS file: /web/www/www/server/source/linc/linc.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- linc.py     14 Mar 2011 18:44:23 -0000      1.1
+++ linc.py     18 Sep 2011 21:34:56 -0000      1.2
@@ -15,28 +15,32 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-# ---------------------------------------------------------------------
-#
-# Command format: "python checklinks.py <local_directory_containing_website> 
<corresponding_address_on_web_server>"
 
 # defines
 
+BASE_DIRECTORY = '/home/w/wwj/www-repo/'
+REMOTE_BASE_DIRECTORY = 'http://www.gnu.org/'
+
+ADDITIONAL_HTTP_HEADERS = 'User-Agent: LINC/alpha\r\nAccept: text/html, 
text/plain, audio/mod, image/*, application/msword, application/pdf, 
application/postscript, text/sgml, video/mpeg, */*;q=0.01\r\n' # end every 
header with "\r\n"
 DELAY_BETWEEN_CHECKS = 1 # In seconds. Set to 0 to disable delay between 
checks of different links.
 DELAY_BETWEEN_RETRIES = 10 # In seconds. Used when a link fails before 
re-checking it. Set to 0 to disable delays.
-NUMBER_OF_ATTEMPTS = 3 # Set to 1 for the program to check links just once, 
without retrying in case of failure. 0 makes the program not even check the 
link at all, so it is not the most recommended of values.
+FORWARDS_TO_FOLLOW = 5 # How many forwards should be followed.
+NUMBER_OF_ATTEMPTS = 3 # Number of times to check a link for error. If an 
attempt is successful, the link is no longer checked during that program run.
 REPORT_FILE_NAME = 'reports-temp/broken_links' # Path to the file to which the 
errors will be reported.
 SEPARATE_TRANSLATION_REPORTS = True # If you set this to True, reports for 
translations will be saved into "REPORT_FILE_NAME.lang" instead of in the main 
report file.
 SKIP_TRANSLATION_FILES = False # If you set this to True, files with 
translations will be skipped.
 SOCKET_TIMEOUT = 20 # After what time to give up with trying to retrieve a 
website.
 
-EXCLUDED_DIRECTORIES_REGEXP = '(wwwes$|wwwin$|education/fr|software/[^/]+$)' # 
Matching directories will not be entered to check their files or subdirectories.
-EXCLUDED_FILENAMES_REGEXP = '\.po$' # Matching files will be ignored.
+# regexp-related defines
+
+EXCLUDED_DIRECTORIES_REGEXP = 
'^(japan|wwwes|wwwin|education/fr|software/[^/]+)$' # Matching directories will 
not be entered to check their files or subdirectories.
+FILENAMES_TO_CHECK_REGEXP = '\.html$' # Only matching files will be checked.
 FTP_LINK_REGEXP = 'ftp://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?'
 HTTP_ERROR_HEADER = '^HTTP/1\.1 (?P<http_error_code>403|404) ' # What to treat 
as a HTTP error header.
+HTTP_FORWARD_HEADER = '^HTTP/1\.1 (301 Moved Permanently|302 Found)$'
 HTTP_LINK_REGEXP = 
'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
-LINK_REGEXP = '<a( .+?)? href="([^mailto:].+?)"( .+?)?>'
-LINK_REGEXP_GROUP = 1 # Number of the group that matters in the above regexp.
+HTTP_NEW_LOCATION_HEADER = '^Location: (?P<new_location>.+)$'
+LINK_REGEXP = '<a( .+?)? href="(?P<link>[^mailto:].+?)"( .+?)?>'
 TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$'
 
 # libraries
@@ -76,7 +80,10 @@
                socketfd.close()
        return None
 
-def get_http_link_error( link ):
+def get_http_link_error( link, forwarded_from = None ): # forwarded_from is 
either None or a list
+       if forwarded_from == None:
+               forwarded_from = []
+
        connection_data = re.search( HTTP_LINK_REGEXP, link )
        if connection_data:
                hostname = connection_data.group( 'hostname' )
@@ -97,7 +104,7 @@
                if resource == None:
                        resource = '/'
 
-               socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' + 
hostname + '\r\n\r\n' )
+               socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' + 
hostname + '\r\n' + ADDITIONAL_HTTP_HEADERS + '\r\n' )
                
                webpage = socket_read( socketfd )
                if webpage == None:
@@ -112,11 +119,44 @@
                        
                header_lines = webpage[ : end_of_headers_pos ]
                header_lines = header_lines.split( '\r\n' )
-               for header_line in header_lines:
-                       match = re.search( HTTP_ERROR_HEADER, header_line )
+               
+               # search for errors
+               match = regexp_search_list( HTTP_ERROR_HEADER, header_lines )
                        if match:
                                http_error_code = match.group( 
'http_error_code' )
                                return 'http error ' + http_error_code + ' 
returned by server'
+                       
+               # look for forwards
+               match = regexp_search_list( HTTP_FORWARD_HEADER, header_lines )
+               if match:
+                       if len( forwarded_from ) < FORWARDS_TO_FOLLOW: # if we 
haven't been forwarded too many times yet...
+                               match = regexp_search_list( 
HTTP_NEW_LOCATION_HEADER, header_lines )
+                               if match:
+                                       forwarded_from.append( link )
+                                       new_location = match.group( 
'new_location' )
+                                       if new_location in forwarded_from:
+                                               return 'forward loop!'
+                                       else:
+                                               return get_http_link_error( 
new_location, forwarded_from )
+                       else: # we've been forwarded too many times, sorry.
+                               return 'too many forwards (over ' + str( len( 
forwarded_from ) ) + ')'
+
+       return None
+       
+def is_match_inside_comment_block( regexp_match ):
+       haystack = regexp_match.string
+       match_pos = regexp_match.start()
+       comment_block_start = haystack.rfind( '<!--', 0, match_pos )
+       comment_block_end = haystack.rfind( '-->', 0, match_pos )
+       if comment_block_start > comment_block_end:
+               return True
+       return False
+
+def regexp_search_list( regexp, the_list ):
+       for list_element in the_list:
+               match = re.search( regexp, list_element )
+               if match:
+                       return match
        return None
 
 def search_directory_for_files( base_directory, directory ):
@@ -129,7 +169,7 @@
                                
                        search_directory_for_files( base_directory, 
relative_path_to_element )
                else:
-                       if re.search( EXCLUDED_FILENAMES_REGEXP, element_name ):
+                       if not re.search( FILENAMES_TO_CHECK_REGEXP, 
element_name ):
                                continue
                
                        if ( SKIP_TRANSLATION_FILES == True ) and re.search( 
TRANSLATION_REGEXP, element_name ):
@@ -162,13 +202,8 @@
 
 ### OK, main program below.
 
-# check if a path has been provided
-if len( sys.argv ) < 3:
-       print 'Please run the program with the following arguments: the 
directory which should be checked and its corresponding URL on the live site 
(so eg.: ./linc.py gnucvs/software/ http://www.gnu.org/software/)'
-       sys.exit( 1 )
-       
-base_directory = sys.argv[1]
-remote_base_directory = sys.argv[2]
+base_directory = BASE_DIRECTORY
+remote_base_directory = REMOTE_BASE_DIRECTORY
 if remote_base_directory[-1] != '/':
        remote_base_directory += '/'
        
@@ -176,7 +211,7 @@
 
 # `cd` to this path
 if not os.path.isdir( base_directory ):
-       print 'Selected path isn\'t a directory (or doesn\'t exist at all).'
+       print 'Entered base directory isn\'t a directory (or doesn\'t exist at 
all).'
        sys.exit( 1 )
 
 # list files
@@ -192,8 +227,8 @@
        file_contents = fd.read()
        fd.close()
 
-       for match in re.findall( LINK_REGEXP, file_contents ):
-               link = match[LINK_REGEXP_GROUP]
+       for match in re.finditer( LINK_REGEXP, file_contents ):
+               link = match.group( 'link' )
                
                line_number = -1
                split_file_contents = file_contents.split( '\n' )
@@ -203,7 +238,7 @@
                                line_number = checked_line_number
                                break
                
-               link_container = { 'filename': file_to_check, 'line_number': 
line_number, 'link': link }
+               link_container = { 'filename': file_to_check, 'line_number': 
line_number, 'link': link, 'is_link_inside_comment_block': 
is_match_inside_comment_block( match ) }
                links_to_check.append( link_container )
 
 # check links
@@ -223,6 +258,7 @@
        filename = link_container['filename']
        line_number = link_container['line_number']
        link = link_container['link']
+       is_link_inside_comment_block = 
link_container['is_link_inside_comment_block']
 
        link_type = None
 
@@ -278,6 +314,9 @@
                already_checked_links.append( { 'link': link, 'error': 
link_error } )
                
        if link_error != None:
+               if is_link_inside_comment_block:
+                       link_error += ' (link commented out)'
+       
                if not SKIP_TRANSLATION_FILES:
                        match = re.search( TRANSLATION_REGEXP, filename )
                        if match and SEPARATE_TRANSLATION_REPORTS:

www/server/source/linc linc.py

Reply via email to