# InvenTree/InvenTree/common/files.py

"""
Files management tools.
"""

from rapidfuzz import fuzz
import tablib
import os

from django.utils.translation import gettext_lazy as _
from django.core.exceptions import ValidationError

# from company.models import ManufacturerPart, SupplierPart


class FileManager:
    """ Class for managing an uploaded file

    The file is parsed into a tablib Dataset (stored on ``self.data``) when
    the manager is constructed. The remaining methods expose the headers and
    rows of that dataset, and try to map the headers found in the file onto
    the known header names listed in ``HEADERS``.
    """

    # Label for the file, interpolated into the error messages raised by process()
    name = ''

    # Fields which are absolutely necessary for valid upload
    REQUIRED_HEADERS = [
        'Quantity'
    ]

    # Fields which are used for part matching (only one of them is needed)
    PART_MATCH_HEADERS = [
        'Part_Name',
        'Part_IPN',
        'Part_ID',
    ]

    # Fields which would be helpful but are not required
    OPTIONAL_HEADERS = [
    ]

    EDITABLE_HEADERS = [
    ]

    # All header names that guess_header() may return.
    # NOTE: evaluated once, when this class body runs - a subclass which
    # overrides one of the lists above must also redefine HEADERS.
    HEADERS = REQUIRED_HEADERS + PART_MATCH_HEADERS + OPTIONAL_HEADERS

    def __init__(self, file, name=None):
        """ Initialize the FileManager class with a user-uploaded file object

        Args:
            file - Uploaded file object (must provide .name and .read())
            name - Optional label for the file, used in error messages
        """

        # Set name
        if name:
            self.name = name

        # Process initial file
        self.process(file)

    def process(self, file):
        """ Read the uploaded file and load its content into self.data

        Args:
            file - Uploaded file object (must provide .name and .read())

        Raises:
            ValidationError - if the file extension is not supported, the text
                              cannot be decoded, or tablib cannot load the data
        """

        # Stays None if loading fails, so the accessors below can detect it
        self.data = None

        ext = os.path.splitext(file.name)[-1].lower()

        if ext in ['.csv', '.tsv', ]:
            # These file formats need string decoding
            try:
                # 'utf-8-sig' reads plain UTF-8 too, and drops a leading BOM
                # which would otherwise end up glued to the first header name
                raw_data = file.read().decode('utf-8-sig')
            except UnicodeDecodeError as exc:
                raise ValidationError(_(f'Error reading {self.name} file (invalid encoding)')) from exc
        elif ext in ['.xls', '.xlsx']:
            # Binary formats are handed to tablib as raw bytes
            raw_data = file.read()
        else:
            raise ValidationError(_(f'Unsupported file format: {ext}'))

        try:
            self.data = tablib.Dataset().load(raw_data)
        except tablib.UnsupportedFormat as exc:
            raise ValidationError(_(f'Error reading {self.name} file (invalid format)')) from exc
        except tablib.core.InvalidDimensions as exc:
            raise ValidationError(_(f'Error reading {self.name} file (incorrect dimension)')) from exc

    def guess_header(self, header, threshold=80):
        """ Try to match a header (from the file) to a list of known headers

        Matching is attempted in order of decreasing strictness: exact,
        case-insensitive, case-insensitive with spaces replaced by
        underscores, and finally fuzzy matching.

        Args:
            header - Header name to look for
            threshold - Match threshold for fuzzy search

        Returns:
            The matching entry of HEADERS, or None if nothing matched
        """

        if header is None:
            return None

        # Headers read from a file are not guaranteed to be strings
        header = str(header)

        # Try for an exact match
        for h in self.HEADERS:
            if h == header:
                return h

        lowered = header.lower()

        # Try for a case-insensitive match
        for h in self.HEADERS:
            if h.lower() == lowered:
                return h

        # Try for a case-insensitive match with space replacement
        underscored = lowered.replace(' ', '_')

        for h in self.HEADERS:
            if h.lower() == underscored:
                return h

        # Finally, look for a close match using fuzzy matching
        matches = []

        for h in self.HEADERS:
            ratio = fuzz.partial_ratio(header, h)
            if ratio > threshold:
                matches.append({'header': h, 'match': ratio})

        if matches:
            # max() returns the first of several equally good candidates,
            # so ties resolve in HEADERS order
            return max(matches, key=lambda item: item['match'])['header']

        return None

    def columns(self):
        """ Return a list of the headers found in the file

        Each entry is a dict holding the header 'name' as found in the file
        and a 'guess' of the matching known header (or None).
        An empty list is returned if no data or no headers are available.
        """

        if self.data is None or self.data.headers is None:
            return []

        return [
            {
                'name': header,
                'guess': self.guess_header(header)
            }
            for header in self.data.headers
        ]

    def col_count(self):
        """ Return the number of columns (headers) in the file. """

        if self.data is None or self.data.headers is None:
            return 0

        return len(self.data.headers)

    def row_count(self):
        """ Return the number of rows in the file. """

        if self.data is None:
            return 0

        return len(self.data)

    def rows(self):
        """ Return a list of all non-empty rows

        Each entry is a dict with the (cleaned) cell values under 'data' and
        the position of the row in the dataset under 'index'.
        Rows in which every cell is blank are skipped.
        """
        rows = []

        for i in range(self.row_count()):

            data = list(self.get_row_data(i))

            # Is the row completely empty? Skip!
            empty = True

            for idx, item in enumerate(data):
                if item is None:
                    # A missing value (e.g. a blank spreadsheet cell) is
                    # treated as an empty cell rather than the text "None"
                    data[idx] = ''
                    continue

                if str(item).strip():
                    empty = False

                try:
                    # Excel import casts number-looking-items into floats, which is annoying
                    if item == int(item) and str(item) != str(int(item)):
                        data[idx] = int(item)
                except (ValueError, TypeError, OverflowError):
                    # Not an integer-like value (text, date, nan, inf, ...): keep as-is
                    pass

            # Skip empty rows
            if empty:
                continue

            rows.append({
                'data': data,
                'index': i
            })

        return rows

    def get_row_data(self, index):
        """ Retrieve row data at a particular index

        Returns None if no data is loaded or the index is past the last row.
        """
        if self.data is None or index >= len(self.data):
            return None

        return self.data[index]

    def get_row_dict(self, index):
        """ Retrieve a dict object representing the data row at a particular offset

        Returns None if no data is loaded or the index is past the last row.
        """

        if self.data is None or index >= len(self.data):
            return None

        return self.data.dict[index]