comparison pylons_app/lib/indexers/daemon.py @ 474:28f19fa562df

updated config files, Implemented content index extensions with whoosh, fixed analyzer to match more words
author Marcin Kuzminski <marcin@python-works.com>
date Sat, 28 Aug 2010 14:53:32 +0200
parents 9b67cebe6609
children c59c4d4323e7
comparison
equal deleted inserted replaced
473:0e8ef6f17203 474:28f19fa562df
36 import traceback 36 import traceback
37 from pylons_app.config.environment import load_environment 37 from pylons_app.config.environment import load_environment
38 from pylons_app.model.hg_model import HgModel 38 from pylons_app.model.hg_model import HgModel
39 from whoosh.index import create_in, open_dir 39 from whoosh.index import create_in, open_dir
40 from shutil import rmtree 40 from shutil import rmtree
41 from pylons_app.lib.indexers import ANALYZER, EXCLUDE_EXTENSIONS, IDX_LOCATION, \ 41 from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
42 SCHEMA, IDX_NAME 42 SCHEMA, IDX_NAME
43 43
44 import logging 44 import logging
45 import logging.config 45 import logging.config
46 logging.config.fileConfig(jn(project_path, 'development.ini')) 46 logging.config.fileConfig(jn(project_path, 'development.ini'))
68 return index_paths_ 68 return index_paths_
69 69
70 def add_doc(self, writer, path, repo): 70 def add_doc(self, writer, path, repo):
71 """Adding doc to writer""" 71 """Adding doc to writer"""
72 72
73 #we don't want to read excluded file extensions just index them 73 ext = unicode(path.split('/')[-1].split('.')[-1].lower())
74 if path.split('/')[-1].split('.')[-1].lower() not in EXCLUDE_EXTENSIONS: 74 #we just index the content of chosen files
75 if ext in INDEX_EXTENSIONS:
76 log.debug(' >> %s [WITH CONTENT]' % path)
75 fobj = open(path, 'rb') 77 fobj = open(path, 'rb')
76 content = fobj.read() 78 content = fobj.read()
77 fobj.close() 79 fobj.close()
78 try: 80 try:
79 u_content = unicode(content) 81 u_content = unicode(content)
80 except UnicodeDecodeError: 82 except UnicodeDecodeError:
81 #in case we have a decode error just represent as byte string 83 #in case we have a decode error just represent as byte string
82 u_content = unicode(str(content).encode('string_escape')) 84 u_content = unicode(str(content).encode('string_escape'))
83 else: 85 else:
84 u_content = u'' 86 log.debug(' >> %s' % path)
87 #just index file name without its content
88 u_content = u''
89
85 writer.add_document(owner=unicode(repo.contact), 90 writer.add_document(owner=unicode(repo.contact),
86 repository=u"%s" % repo.name, 91 repository=u"%s" % repo.name,
87 path=u"%s" % path, 92 path=u"%s" % path,
88 content=u_content, 93 content=u_content,
89 modtime=os.path.getmtime(path)) 94 modtime=os.path.getmtime(path),
95 extension=ext)
90 96
91 def build_index(self): 97 def build_index(self):
92 if os.path.exists(IDX_LOCATION): 98 if os.path.exists(IDX_LOCATION):
99 log.debug('removing previos index')
93 rmtree(IDX_LOCATION) 100 rmtree(IDX_LOCATION)
94 101
95 if not os.path.exists(IDX_LOCATION): 102 if not os.path.exists(IDX_LOCATION):
96 os.mkdir(IDX_LOCATION) 103 os.mkdir(IDX_LOCATION)
97 104
100 107
101 for cnt, repo in enumerate(scan_paths(self.repo_location).values()): 108 for cnt, repo in enumerate(scan_paths(self.repo_location).values()):
102 log.debug('building index @ %s' % repo.path) 109 log.debug('building index @ %s' % repo.path)
103 110
104 for idx_path in self.get_paths(repo.path): 111 for idx_path in self.get_paths(repo.path):
105 log.debug(' >> %s' % idx_path)
106 self.add_doc(writer, idx_path, repo) 112 self.add_doc(writer, idx_path, repo)
107 writer.commit(merge=True) 113 writer.commit(merge=True)
108 114
109 log.debug('>>> FINISHED BUILDING INDEX <<<') 115 log.debug('>>> FINISHED BUILDING INDEX <<<')
110 116
168 self.build_index() 174 self.build_index()
169 else: 175 else:
170 self.update_index() 176 self.update_index()
171 177
172 if __name__ == "__main__": 178 if __name__ == "__main__":
173 repo_location = '/home/marcink/python_workspace_dirty/*' 179 repo_location = '/home/marcink/hg_repos/*'
174 180 full_index = True # False means looking just for changes
175 try: 181 try:
176 l = DaemonLock() 182 l = DaemonLock()
177 WhooshIndexingDaemon(repo_location=repo_location).run(full_index=True) 183 WhooshIndexingDaemon(repo_location=repo_location)\
184 .run(full_index=full_index)
178 l.release() 185 l.release()
179 except LockHeld: 186 except LockHeld:
180 sys.exit(1) 187 sys.exit(1)
181 188