12e9a9721c330a45c6c42f975245bb491d14c137
[releng-anteater.git] / anteater / src / project_scan.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 ##############################################################################
4 # Copyright (c) 2017 Luke Hinds <lhinds@redhat.com>, Red Hat
5 #
6 # All rights reserved. This program and the accompanying materials
7 # are made available under the terms of the Apache License, Version 2.0
8 # which accompanies this distribution, and is available at
9 # http://www.apache.org/licenses/LICENSE-2.0
10 ##############################################################################
11
"""
    Accepts the --path argument and iterates the root directory using os.walk.
    Each file is checked for whether it is a binary or contains a blacklisted
    string. If any violations are found, the script adds the violation to a
    log file.
"""
17
18 from __future__ import division, print_function, absolute_import
19 import hashlib
20 import six.moves.configparser
21 import os
22 import re
23 import logging
24 from binaryornot.check import is_binary
25
26 from . import get_lists
27
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Settings come from 'anteater.conf'. The path is relative, so it is resolved
# against the current working directory of the process.
config = six.moves.configparser.RawConfigParser()
config.read('anteater.conf')
# Prefix of every report path written by this module. Report names are built
# by plain string concatenation, so this presumably ends with a path
# separator -- TODO confirm against anteater.conf.
reports_dir = config.get('config', 'reports_dir')
# Read from the config; the functions visible in this module work with a
# local variable / parameter of the same name rather than this global.
master_list = config.get('config', 'master_list')
# Directory names pruned from the os.walk() traversals in this module.
ignore_dirs = ['.git']
# Shared SHA-256 object. NOTE: a hashlib object accumulates every update()
# call, so a digest taken from a shared instance covers all data fed to it so
# far, not just the most recent input.
hasher = hashlib.sha256()
35
36
def prepare_project(project, project_dir):
    """Collect the audit lists for ``project`` and run every scan over it.

    The black lists, white lists and project waivers are obtained from a
    single ``get_lists.GetLists`` instance and handed to ``scan_file``; the
    licence header and root LICENSE checks are run afterwards.

    :param project: project name, forwarded to the list lookups and scans
    :param project_dir: root directory of the project to be scanned
    """
    audit_lists = get_lists.GetLists()

    # Binary white list.
    whitelisted_binaries = audit_lists.binary_list(project)

    # File name black list together with the project's waivers.
    name_blacklist, name_waivers = audit_lists.file_audit_list(project)

    # File content black list together with the project's waivers.
    content_rules, content_waivers = audit_lists.file_content_list(project)

    # Files excluded from the content scan.
    ignored_files = audit_lists.file_ignore()

    # Licence check inputs: extensions to inspect and files to leave alone.
    header_extensions = audit_lists.licence_extensions()
    header_exempt = audit_lists.licence_ignore()

    # File name, binary and content scans.
    scan_file(project_dir, project, whitelisted_binaries, name_blacklist,
              name_waivers, content_rules, ignored_files, content_waivers)

    # Licence header scan, then the root LICENSE file check.
    licence_check(header_extensions, header_exempt, project, project_dir)
    licence_root_check(project_dir, project)
67
68
def _append_report(prefix, project, text):
    """Append ``text`` to the report ``<reports_dir><prefix><project>.log``."""
    with open(reports_dir + prefix + project + ".log", "a") as gate_report:
        gate_report.write(text)


def scan_file(project_dir, project, binary_list, file_audit_list,
              file_audit_project_list, master_list, file_ignore,
              project_list):
    """Search a project tree for banned file names, binaries and content.

    Walks ``project_dir`` (directories named in ``ignore_dirs`` are pruned)
    and runs three checks on every file. Each violation is logged as an error
    and appended to a per-project report under ``reports_dir``:

    * file name: the path matches ``file_audit_list`` and is not waived by
      ``file_audit_project_list`` (report ``file-names_<project>.log``);
    * binary: the file is binary, not matched by ``binary_list`` and its
      SHA-256 digest is not in the project's hash white list
      (report ``binaries-<project>.log``);
    * content: a line matches a ``master_list`` regex and is not waived by
      ``project_list`` (report ``contents-<project>.log``).

    :param project_dir: root directory to walk
    :param project: project name, used for hash lookups and report names
    :param binary_list: object with a ``search(path)`` method (e.g. a
        compiled regular expression) matching white-listed binaries
    :param file_audit_list: object with a ``search(path)`` method matching
        black-listed file names
    :param file_audit_project_list: object with a ``search(path)`` method
        matching file names waived for this project
    :param master_list: mapping whose values are dicts with a ``'regex'``
        and a ``'desc'`` entry
    :param file_ignore: iterable of file name suffixes excluded from the
        content scan
    :param project_list: pattern handed to ``re.search``; lines matching it
        are waived from the content scan
    """
    # One instance serves every hash lookup made during the walk.
    hashlist = get_lists.GetLists()
    ignored_suffixes = tuple(file_ignore)

    for root, dirs, files in os.walk(project_dir):
        # Filter out ignored directories from list.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for items in files:
            full_path = os.path.join(root, items)

            # Check for Blacklisted file names
            match = file_audit_list.search(full_path)
            if match and not file_audit_project_list.search(full_path):
                logger.error('Blacklisted filename: %s', full_path)
                logger.error('Matched String: %s', match.group())
                _append_report(
                    "file-names_", project,
                    'Blacklisted filename: {0}\n'
                    'Matched String: {1}\n'.format(full_path, match.group()))

            # Check if Binary is whitelisted
            if is_binary(full_path) and not binary_list.search(full_path):
                # A fresh hash object per file: a shared one would accumulate
                # the bytes of every binary seen so far, so each later digest
                # would no longer be the hash of the file being reported.
                file_hasher = hashlib.sha256()
                with open(full_path, 'rb') as afile:
                    file_hasher.update(afile.read())
                digest = file_hasher.hexdigest()

                if digest in hashlist.binary_hash(project, full_path):
                    logger.info('Found matching file hash for file: %s',
                                full_path)
                else:
                    logger.error('Non Whitelisted Binary file: %s',
                                 full_path)
                    logger.error('Please submit patch with this hash: %s',
                                 digest)
                    _append_report(
                        "binaries-", project,
                        'Non Whitelisted Binary: {0}\n'.format(full_path))
                continue

            if items.endswith(ignored_suffixes):
                continue

            try:
                with open(full_path, 'r') as fo:
                    lines = fo.readlines()
            except IOError:
                logger.error('%s does not exist', full_path)
                # Nothing was read; skip the file instead of scanning an
                # undefined or stale ``lines`` from a previous file.
                continue

            for line in lines:
                # Check for sensitive content in project files
                # (``values()`` works on both Python 2 and Python 3).
                for value in master_list.values():
                    regex = value['regex']
                    desc = value['desc']
                    if not re.search(regex, line) \
                            or re.search(project_list, line):
                        continue
                    logger.error('File contains violation: %s', full_path)
                    logger.error('Flagged Content: %s', line.rstrip())
                    logger.error('Matched Regular Exp: %s', regex)
                    logger.error('Rationale: %s', desc.rstrip())
                    # One field per line so that consecutive records do not
                    # run together in the report.
                    _append_report(
                        "contents-", project,
                        'File contains violation: {0}\n'
                        'Flagged Content: {1}\n'
                        'Matched Regular Exp: {2}\n'
                        'Rationale: {3}\n'.format(full_path, line.rstrip(),
                                                  regex, desc.rstrip()))
149
150
151
def licence_root_check(project_dir, project):
    """Report whether a LICENSE file sits directly under ``project_dir``.

    A present file is logged at info level. A missing file is logged as an
    error and recorded in the ``licence-<project>.log`` report under
    ``reports_dir``.

    :param project_dir: root directory of the project being checked
    :param project: project name, used to build the report file name
    """
    licence_path = project_dir + '/LICENSE'
    if os.path.isfile(licence_path):
        logger.info('LICENSE file present in: %s', project_dir)
        return

    logger.error('LICENSE file missing in: %s', project_dir)
    report_path = reports_dir + "licence-" + project + ".log"
    with open(report_path, "a") as gate_report:
        message = 'LICENSE file missing in: {0}\n'.format(project_dir)
        gate_report.write(message)
162
163
def licence_check(licence_ext, licence_ignore, project, project_dir):
    """Perform basic checks for the presence of licence strings.

    Walks ``project_dir`` (directories named in ``ignore_dirs`` are pruned).
    Every non-binary file whose name ends with one of ``licence_ext`` and is
    not listed in ``licence_ignore`` is read and searched, case-insensitively,
    for a licence marker. Files without a marker are logged as errors and
    recorded in the ``licence-<project>.log`` report under ``reports_dir``.

    :param licence_ext: iterable of file name suffixes to inspect
    :param licence_ignore: container of file names exempt from the check
    :param project: project name, used to build the report file name
    :param project_dir: root directory of the project being checked
    """
    # Note: Hardcoded use of 'copyright' & 'spdx' is the result
    # of a decision made at 2017 plugfest to limit searches to
    # these strings; the Creative Commons BY 4.0 URL is accepted as well.
    patterns = ('copyright', 'spdx',
                'http://creativecommons.org/licenses/by/4.0')
    extensions = tuple(licence_ext)
    report_path = reports_dir + "licence-" + project + ".log"

    for root, dirs, files in os.walk(project_dir):
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for file_name in files:
            if not file_name.endswith(extensions) \
                    or file_name in licence_ignore:
                continue
            full_path = os.path.join(root, file_name)
            if is_binary(full_path):
                continue

            # The context manager closes the handle even if read() fails;
            # the text is lower-cased once rather than once per pattern.
            with open(full_path, 'r') as fo:
                content = fo.read().lower()

            if any(pattern in content for pattern in patterns):
                logger.info('Licence string present: %s', full_path)
            else:
                logger.error('Licence header missing: %s', full_path)
                with open(report_path, "a") as gate_report:
                    gate_report.write('Licence header missing: {0}\n'.
                                      format(full_path))