changeset 5983:9bd355d893f4

cfg.url_prefix_action_check = True to strictly check whether action URLs look like we generate them. url_prefix_action was introduced to be able to use robots.txt against bots doing unwanted requests for all sorts of actions, just because they found a link to them in the menu_bar (or somewhere else). Problem: even if url_prefix_action was in use (e.g. == 'action'), we still accepted URLs without that prefix (e.g. cached by some bot or search engine from earlier times, or external links). Answering those requests can take a lot of CPU time, and we can't get rid of them without doing the check that was implemented with this changeset.
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Sun, 07 Apr 2013 00:59:26 +0200
parents 897111701cf8
children e7136d5731df
files MoinMoin/config/multiconfig.py MoinMoin/wsgiapp.py docs/CHANGES
diffstat 3 files changed, 26 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/config/multiconfig.py	Sat Apr 06 23:26:08 2013 +0200
+++ b/MoinMoin/config/multiconfig.py	Sun Apr 07 00:59:26 2013 +0200
@@ -1032,6 +1032,8 @@
 
     ('url_prefix_action', None,
      "Use 'action' to enable action URL generation to be compatible with robots.txt. It will generate .../action/info/PageName?action=info then. Recommended for internet wikis."),
+    ('url_prefix_action_check', False,
+     "Do a strict check whether the URL paths for actions look like we generate them (otherwise do a 404 response)."),
 
     ('notification_bot_uri', None, "URI of the Jabber notification bot."),
 
--- a/MoinMoin/wsgiapp.py	Sat Apr 06 23:26:08 2013 +0200
+++ b/MoinMoin/wsgiapp.py	Sun Apr 07 00:59:26 2013 +0200
@@ -12,7 +12,7 @@
 logging = log.getLogger(__name__)
 
 from MoinMoin.web.contexts import AllContext, Context, XMLRPCContext
-from MoinMoin.web.exceptions import HTTPException
+from MoinMoin.web.exceptions import HTTPException, abort
 from MoinMoin.web.request import Request, MoinMoinFinish, HeaderSet
 from MoinMoin.web.utils import check_forbidden, check_surge_protect, fatal_response, \
     redirect_last_visited
@@ -94,8 +94,12 @@
         context.finish()
         context.clock.stop('run')
 
-def remove_prefix(path, prefix=None):
-    """ Remove an url prefix from the path info and return shortened path. """
+def remove_prefix(path, action_name, prefix, check_prefix):
+    """
+    Remove an url prefix from the path info and return shortened path.
+    
+    If check_prefix is True, we do some consistency checks and 404 invalid URLs.
+    """
     # we can have all action URLs like this: /action/ActionName/PageName?action=ActionName&...
     # this is just for robots.txt being able to forbid them for crawlers
     if prefix is not None:
@@ -105,13 +109,21 @@
             path = path[len(prefix):]
             action, path = (path.split('/', 1) + ['', ''])[:2]
             path = '/' + path
+            if check_prefix and action != action_name:
+                # inconsistency found (action in querystr != action in path)
+                abort(404)
+        elif check_prefix and action_name != 'show':
+            # invalid: a non-default (non-show) action, but the prefix is not present
+            abort(404)
     return path
 
 def dispatch(request, context, action_name='show'):
     cfg = context.cfg
 
     # The last component in path_info is the page name, if any
-    path = remove_prefix(request.path, cfg.url_prefix_action)
+    path = remove_prefix(request.path, action_name,
+                         cfg.url_prefix_action,
+                         cfg.url_prefix_action_check)
 
     if path.startswith('/'):
         pagename = wikiutil.normalize_pagename(path, cfg)
--- a/docs/CHANGES	Sat Apr 06 23:26:08 2013 +0200
+++ b/docs/CHANGES	Sun Apr 07 00:59:26 2013 +0200
@@ -34,6 +34,14 @@
     Output encoding is utf-8, columns are in this order:
     time, event, username, ip, wikiname, pagename, url, referrer, ua
     time: UNIX timestamp (float)
+  * Added strict checking for action URLs to avoid load caused by bots:
+      url_prefix_action = 'action'
+      url_prefix_action_check = True  # New, default is False
+    Note: action URLs generated without these settings will not work any
+    longer and (with url_prefix_action_check = True) will get rejected by
+    MoinMoin with a 404 "Not Found" response.
+    To disallow actions for the bots, add this to /robots.txt:
+      Disallow: /action/
 
 
 Version 1.9.7: