3a04a8359b60c3917411551762122f42f192b42a
[releng-anteater.git] / anteater / src / project_scan.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 ##############################################################################
4 # Copyright (c) 2017 Luke Hinds <lhinds@redhat.com>, Red Hat
5 #
6 # All rights reserved. This program and the accompanying materials
7 # are made available under the terms of the Apache License, Version 2.0
8 # which accompanies this distribution, and is available at
9 # http://www.apache.org/licenses/LICENSE-2.0
10 ##############################################################################
11
12 """
    Accepts the --path argument and walks the project root directory using
    os.walk, checking whether each file is a binary or contains a blacklisted
    string. Any violation found is logged and appended to a report file.
16 """
17
18 from __future__ import division, print_function, absolute_import
19 import hashlib
20 import six.moves.configparser
21 import os
22 import re
23 import logging
24 from binaryornot.check import is_binary
25
26 from . import get_lists
27
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Settings are read at import time from 'anteater.conf', a relative path and
# therefore resolved against the current working directory.
config = six.moves.configparser.RawConfigParser()
config.read('anteater.conf')
# Prefix that is concatenated verbatim in front of every report file name
# written by this module.
reports_dir = config.get('config', 'reports_dir')
master_list = config.get('config', 'master_list')
# NOTE(review): this reads the 'master_list' option, the same key as the line
# above -- presumably 'ignore_list' was intended; confirm against
# anteater.conf before changing.
ignore_list = config.get('config', 'master_list')
# Directory names pruned from the os.walk() traversals in this module.
ignore_dirs = ['.git']
# NOTE(review): one SHA-256 object for the whole module; every update() call
# made on it accumulates into the same running digest.
hasher = hashlib.sha256()
36
37
def prepare_project(project, project_dir):
    """Gather the black/white lists for a project and run every check.

    The lists and project waivers are fetched from a single
    ``get_lists.GetLists`` instance, after which the file scan, the licence
    header check and the root LICENSE check are run in that order.

    :param project: project name, handed to the list lookups and the checks
    :param project_dir: root directory of the project to be scanned
    """
    list_source = get_lists.GetLists()

    # White list of binaries for this project.
    binary_whitelist = list_source.binary_list(project)

    # Black list of file names, together with the project waivers.
    name_blacklist, name_waivers = list_source.file_audit_list(project)

    # Black list of file contents, together with the project waivers.
    content_blacklist, content_waivers = \
        list_source.file_content_list(project)

    # Files that are skipped by the content scan.
    skipped_files = list_source.file_ignore()

    # Licence related lists.
    header_extensions = list_source.licence_extensions()
    header_exemptions = list_source.licence_ignore()

    # Rudimentary scans: file names, binaries and file contents.
    scan_file(
        project_dir,
        project,
        binary_whitelist,
        name_blacklist,
        name_waivers,
        content_blacklist,
        content_waivers,
        skipped_files,
    )

    # Licence header checks, then the root LICENSE file check.
    licence_check(header_extensions, header_exemptions, project, project_dir)
    licence_root_check(project_dir, project)
68
69
def _append_report(report_path, entries):
    """Append every string in ``entries`` to the report at ``report_path``."""
    with open(report_path, "a") as gate_report:
        gate_report.writelines(entries)


def scan_file(project_dir, project, binary_list, file_audit_list,
              file_audit_project_list, master_list, ignore_list,
              file_ignore):
    """Walk ``project_dir`` looking for banned file names, binaries and content.

    For every file below ``project_dir`` (directories named in ``ignore_dirs``
    are pruned) three checks are made:

    * the full path is matched against the file name black list;
    * a binary file whose path is not white listed is hashed with SHA-256 and
      the digest is looked up in the hashes known for the project;
    * any other file (this includes binaries whose path matches
      ``binary_list``) is read line by line and each line is tested against
      the content black list, unless the file name ends with one of the
      ``file_ignore`` suffixes.

    Violations are logged and appended to ``file-names_<project>.log``,
    ``binaries-<project>.log`` or ``contents-<project>.log``, each prefixed
    with ``reports_dir``.

    :param project_dir: root directory to walk
    :param project: project name; used for the hash lookup and report names
    :param binary_list: object with a ``search(path)`` method; a truthy
        result white lists the binary
    :param file_audit_list: object with a ``search(path)`` method returning a
        match object (``group()`` is called on it) for black listed names
    :param file_audit_project_list: object with a ``search(path)`` method; a
        truthy result waives a black listed name
    :param master_list: mapping whose values are mappings holding a
        ``'regex'`` and a ``'desc'`` entry
    :param ignore_list: regular expression; a line matching it is never
        reported
    :param file_ignore: iterable of file name suffixes excluded from the
        content scan
    """
    # Loop invariants: one list accessor and one suffix tuple for the whole
    # walk instead of rebuilding them for every file.
    hashlist = get_lists.GetLists()
    ignored_suffixes = tuple(file_ignore)

    for root, dirs, files in os.walk(project_dir):
        # Filter out ignored directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for items in files:
            full_path = os.path.join(root, items)

            # Check for blacklisted file names.
            match = file_audit_list.search(full_path)
            if match and not file_audit_project_list.search(full_path):
                logger.error('Blacklisted filename: %s', full_path)
                logger.error('Matched String: %s', match.group())
                # Each entry ends with a newline so that consecutive report
                # entries do not run together on one line.
                _append_report(
                    reports_dir + "file-names_" + project + ".log",
                    ['Blacklisted filename: {0}\n'.format(full_path),
                     'Matched String: {0}\n'.format(match.group())])

            # Hashes registered for this path, used by the binary check.
            binary_hash = hashlist.binary_hash(project, full_path)

            if is_binary(full_path) and not binary_list.search(full_path):
                # A fresh hash object per file: reusing one object would make
                # every digest depend on all files hashed before it.
                file_hasher = hashlib.sha256()
                with open(full_path, 'rb') as afile:
                    file_hasher.update(afile.read())
                digest = file_hasher.hexdigest()

                if digest in binary_hash:
                    logger.info('Found matching file hash for file: %s',
                                full_path)
                else:
                    logger.error('Non Whitelisted Binary file: %s',
                                 full_path)
                    logger.error('Please submit patch with this hash: %s',
                                 digest)
                    _append_report(
                        reports_dir + "binaries-" + project + ".log",
                        ['Non Whitelisted Binary: {0}\n'.format(full_path),
                         'Submit patch with the following hash: {0}\n'.
                         format(digest)])
                continue

            if items.endswith(ignored_suffixes):
                continue

            try:
                with open(full_path, 'r') as fo:
                    lines = fo.readlines()
            except IOError:
                logger.error('%s does not exist', full_path)
                # Nothing was read; do not fall through and scan the lines
                # of a previously read file (or an unbound name).
                continue

            for line in lines:
                # Check for sensitive content in project files.
                for value in master_list.values():
                    regex = value['regex']
                    desc = value['desc']
                    if not re.search(regex, line) or \
                            re.search(ignore_list, line):
                        continue
                    logger.error('File contains violation: %s', full_path)
                    logger.error('Flagged Content: %s', line.rstrip())
                    logger.error('Matched Regular Exp: %s', regex)
                    logger.error('Rationale: %s', desc.rstrip())
                    _append_report(
                        reports_dir + "contents-" + project + ".log",
                        ['File contains violation: {0}\n'.format(full_path),
                         'Flagged Content: {0}\n'.format(line.rstrip()),
                         'Matched Regular Exp: {0}\n'.format(regex),
                         'Rationale: {0}\n'.format(desc.rstrip())])
154
155
156
def licence_root_check(project_dir, project):
    """Check that a LICENSE file exists directly under ``project_dir``.

    The outcome is logged either way; when the file is absent a line is also
    appended to the ``licence-<project>.log`` report.

    :param project_dir: root directory of the project
    :param project: project name, used to build the report file name
    """
    licence_path = project_dir + '/LICENSE'

    if os.path.isfile(licence_path):
        logger.info('LICENSE file present in: %s', project_dir)
        return

    logger.error('LICENSE file missing in: %s', project_dir)
    report_path = reports_dir + "licence-" + project + ".log"
    with open(report_path, "a") as gate_report:
        gate_report.write(
            'LICENSE file missing in: {0}\n'.format(project_dir))
167
168
def licence_check(licence_ext, licence_ignore, project, project_dir):
    """Perform basic checks for the presence of licence strings.

    Walks ``project_dir`` (directories named in ``ignore_dirs`` are pruned)
    and, for every non-binary file whose name ends with one of
    ``licence_ext`` and is not listed in ``licence_ignore``, searches the
    lower-cased content for a licence marker. Files without a marker are
    logged and appended to the ``licence-<project>.log`` report.

    :param licence_ext: iterable of file name suffixes that need a header
    :param licence_ignore: container of file names exempt from the check
    :param project: project name, used to build the report file name
    :param project_dir: root directory to walk
    """
    # Note: Hardcoded use of 'copyright' & 'spdx' is the result of a decision
    # made at 2017 plugfest to limit searches to these strings; the Creative
    # Commons BY 4.0 URL is accepted as well.
    patterns = ('copyright', 'spdx',
                'http://creativecommons.org/licenses/by/4.0')
    # Loop invariant: endswith() wants a tuple, build it once.
    extensions = tuple(licence_ext)

    for root, dirs, files in os.walk(project_dir):
        # Prune ignored directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for filename in files:
            if not filename.endswith(extensions) \
                    or filename in licence_ignore:
                continue

            full_path = os.path.join(root, filename)
            if is_binary(full_path):
                continue

            # The context manager closes the handle; it used to be leaked
            # for every file that was checked.
            with open(full_path, 'r') as source_file:
                content = source_file.read().lower()

            if any(pattern in content for pattern in patterns):
                logger.info('Licence string present: %s', full_path)
                continue

            logger.error('Licence header missing: %s', full_path)
            with open(reports_dir + "licence-" + project + ".log",
                      "a") as gate_report:
                gate_report.write('Licence header missing: {0}\n'.
                                  format(full_path))
192                             gate_report.write('Licence header missing: {0}\n'.
193                                               format(full_path))