//
// $Id: MIMEType.cc,v 1.1 2005/02/25 15:31:46 anonymous Exp $
// 
// basic MIME type detection
#include <stdio.h>
#include <string.h>
#include "DocFilters.h"
#include "CharsetConvert.h"

/*#include "portable.h"
#include "DocFilters.h"
#include "debuglevel.h"

#include "CharsetDetect.h"
#ifdef CHARSET_DETECT
#include "lang.h"
#include "globals.h"
#endif


#include "Options.h"

#if defined(APP_LanguageID) || defined(APP_SysTools)
#define ISOlang(x) "??"
#endif

#include "filters.h"
*/
// Bytes  Encoding Form
// 00 00 FE FF    UTF-32, big-endian
// FF FE 00 00    UTF-32, little-endian
// FE FF          UTF-16, big-endian
// FF FE          UTF-16, little-endian
// EF BB BF       UTF-8

// Check the signature of the buffer to detect a BOM (Unicode signature).
//
// When a BOM is found, D->mimeencoding is set to the matching encoding name
// ("UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE" or "UTF-32BE") and the buffer
// content following the BOM is returned; it is passed through
// DefaultCharsetConvertor(<encoding>,"UTF-8") unless the encoding already is
// UTF-8.
//
// When no BOM is found, D->mimeencoding is left untouched and an empty
// string is returned.  Note that an empty string is also returned for a
// buffer that holds nothing but a UTF-8 BOM.
//
std::string CheckUnicodeSignature(DocumentPtr D) {
  const size_t s=D->buffer.size();

  // Read the leading bytes as unsigned values.  Where plain char is signed,
  // a byte such as 0xef is negative once promoted to int and would never
  // compare equal to the literal 0xef, so no BOM would ever be detected.
  // (Assumes D->buffer holds plain char elements.)
  unsigned char b[4]={0,0,0,0};
  for (size_t i=0; i<4 && i<s; i++)
    b[i]=static_cast<unsigned char>(D->buffer[i]);

  size_t bom_size=0;

  // The size guards are still required: b[] is zero padded, so a 2-byte
  // "FF FE" buffer would otherwise look like a UTF-32LE signature.
  if (s>=3 && b[0]==0xef && b[1]==0xbb && b[2]==0xbf) {
    D->mimeencoding="UTF-8";
    bom_size=3;
  } else if (s>=4 && b[0]==0x00 && b[1]==0x00 && b[2]==0xfe && b[3]==0xff) {
    D->mimeencoding="UTF-32BE";
    bom_size=4;
  } else if (s>=4 && b[0]==0xff && b[1]==0xfe && b[2]==0x00 && b[3]==0x00) {
    // Must be tested before UTF-16LE: the UTF-32LE BOM starts with the
    // UTF-16LE one.
    D->mimeencoding="UTF-32LE";
    bom_size=4;
  } else if (s>=2 && b[0]==0xff && b[1]==0xfe) {
    D->mimeencoding="UTF-16LE";
    bom_size=2;
  } else if (s>=2 && b[0]==0xfe && b[1]==0xff) {
    D->mimeencoding="UTF-16BE";
    bom_size=2;
  } else
    return "";

  // Already UTF-8: only the BOM has to be stripped.
  if(D->mimeencoding=="UTF-8")
    return D->buffer.substr(bom_size);

  return DefaultCharsetConvertor(D->mimeencoding,"UTF-8").
    Convert(D->buffer.substr(bom_size));
}

// Decide whether the buffer looks like HTML: once leading blanks
// (space, tab, newline, carriage return) and any run of "&nbsp;" entities
// have been skipped, the next character must be '<'.
//
// Returns false when that tag starts with "systran" or "dnt"
// (case-insensitive), or when the buffer begins directly with "<?xml".
bool HasHTMLHeader(const char *buffer) {
  // Step over leading whitespace.
  const char *cursor=buffer+strspn(buffer, " \t\n\r");

  // Step over any leading non-breaking-space entities.
  for (; strncmp(cursor,"&nbsp;",6)==0; cursor+=6) {
  }

  if (*cursor!='<')
    return false;

  const char *tag=cursor+1;

  // Systran text tags are not HTML.
  const bool systran_tag=strncasecmp(tag,"systran",7)==0 ||
                         strncasecmp(tag,"dnt",3)==0;

  // An XML declaration only counts when it sits at the very start of the
  // buffer, i.e. nothing was skipped before the '<'.
  const bool xml_declaration=(tag==buffer+1) && strncmp(tag,"?xml",4)==0;

  return !(systran_tag || xml_declaration);
}

// Report whether the buffer begins with an XML declaration ("<?xml").
// XML files that lack the declaration are not detected.
bool HasXMLHeader(const char *buffer) {
  static const char xml_decl[]="<?xml";
  return strncmp(buffer,xml_decl,sizeof(xml_decl)-1)==0;
}
/*
void DetectMIMEType(DocumentPtr ptrD)
{
  // Use user-defined charset, if any
  const char *input_charset = NULL;
  const char *header;
  ptrD->mimeencoding="";

  // For UNICODE filter, we first detect unicode header and in a second step
  // check whether it is text or html.
  // Note that HTML detection for a Unicode document is not the same as
  // HTML detection for a non-Unicode document.
  // A normalization is required here

  // Unicode

  std::string utf8_text=CheckUnicodeSignature(ptrD);

  if(!utf8_text.empty()) { // We have an UTF encoding
    if(!DocFilters3FiltersPtr->FindMIMEType(utf8_text,ptrD->mimetype))
      ptrD->mimetype="application/octet-stream";
    return;
  }
  
  if(DocFilters3FiltersPtr->FindMIMEType(ptrD->buffer,ptrD->mimetype)) {
    if(strncmp(ptrD->mimetype.c_str(),"text/",5) &&
        (ptrD->mimetype.size()<4 || 
         ptrD->mimetype.substr(ptrD->mimetype.size()-4)!="+xml"))
      return; // Does not need any charset detection
  } else {
    ptrD->mimetype="application/octet-stream";
    return;
  }

#ifdef CHARSET_DETECT
  if (input_charset && *input_charset && strcmp(input_charset,"auto")!=0)
#else
  if (input_charset && *input_charset)
#endif
  {
    const char *normalized_input_charset=(char*)FindISOCharset(input_charset);
    ptrD->mimeencoding=normalized_input_charset?normalized_input_charset:
                                                input_charset;
  }

#if defined(APP_LanguageID) || defined(APP_SysTools)
  if (ptrD->mimetype=="text/html")
    ptrD->mimeencoding=FindHTMLCharset(ptrD->buffer.c_str());
#endif
  
  if (ptrD->mimeencoding.empty())
#ifdef CHARSET_DETECT
  {
    ptrD->mimeencoding=DetectCharset(ptrD->buffer.data(),ptrD->buffer.size(),ISOlang(g_src_lang));
    if(!ptrD->mimeencoding.empty())
      print_debug("charset_detect",1,"Detected charset: %s\n",ptrD->mimeencoding.c_str());
  }
#else
    ptrD->mimeencoding="UTF-8";
#endif
}
*/
