// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: ucsdet.h * encoding: UTF-8 * indentation:4 * * created on: 2005Aug04 * created by: Andy Heninger * * ICU Character Set Detection, API for C * * Draft version 18 Oct 2005 * */ #ifndef __UCSDET_H #define __UCSDET_H #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "unicode/localpointer.h" #include "unicode/uenum.h" /** * \file * \brief C API: Charset Detection API * * This API provides a facility for detecting the * charset or encoding of character data in an unknown text format. * The input data can be from an array of bytes. *
* Character set detection is at best an imprecise operation. The detection * process will attempt to identify the charset that best matches the characteristics * of the byte data, but the process is partly statistical in nature, and * the results can not be guaranteed to always be correct. *
* For best accuracy in charset detection, the input data should be primarily * in a single language, and a minimum of a few hundred bytes worth of plain text * in the language are needed. The detection process will attempt to * ignore html or xml style markup that could otherwise obscure the content. *
* An alternative to the ICU Charset Detector is the * Compact Encoding Detector, https://github.com/google/compact_enc_det. * It often gives more accurate results, especially with short input samples. */ struct UCharsetDetector; /** * Structure representing a charset detector * @stable ICU 3.6 */ typedef struct UCharsetDetector UCharsetDetector; struct UCharsetMatch; /** * Opaque structure representing a match that was identified * from a charset detection operation. * @stable ICU 3.6 */ typedef struct UCharsetMatch UCharsetMatch; /** * Open a charset detector. * * @param status Any error conditions occurring during the open * operation are reported back in this variable. * @return the newly opened charset detector. * @stable ICU 3.6 */ U_STABLE UCharsetDetector * U_EXPORT2 ucsdet_open(UErrorCode *status); /** * Close a charset detector. All storage and any other resources * owned by this charset detector will be released. Failure to * close a charset detector when finished with it can result in * memory leaks in the application. * * @param ucsd The charset detector to be closed. * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ucsdet_close(UCharsetDetector *ucsd); #if U_SHOW_CPLUSPLUS_API U_NAMESPACE_BEGIN /** * \class LocalUCharsetDetectorPointer * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). * For most methods see the LocalPointerBase base class. * * @see LocalPointerBase * @see LocalPointer * @stable ICU 4.4 */ U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); U_NAMESPACE_END #endif /** * Set the input byte data whose charset is to detected. * * Ownership of the input text byte array remains with the caller. * The input string must not be altered or deleted until the charset * detector is either closed or reset to refer to different input text. * * @param ucsd the charset detector to be used. * @param textIn the input text of unknown encoding. . 
* @param len the length of the input text, or -1 if the text * is NUL terminated. * @param status any error conditions are reported back in this variable. * * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); /** Set the declared encoding for charset detection. * The declared encoding of an input text is an encoding obtained * by the user from an http header or xml declaration or similar source that * can be provided as an additional hint to the charset detector. * * How and whether the declared encoding will be used during the * detection process is TBD. * * @param ucsd the charset detector to be used. * @param encoding an encoding for the current data obtained from * a header or declaration or other source outside * of the byte data itself. * @param length the length of the encoding name, or -1 if the name string * is NUL terminated. * @param status any error conditions are reported back in this variable. * * @stable ICU 3.6 */ U_STABLE void U_EXPORT2 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); /** * Return the charset that best matches the supplied input data. * * Note though, that because the detection * only looks at the start of the input data, * there is a possibility that the returned charset will fail to handle * the full set of input data. *
* The returned UCharsetMatch object is owned by the UCharsetDetector. * It will remain valid until the detector input is reset, or until * the detector is closed. *
* The function will fail if *
* The returned UCharsetMatch objects are owned by the UCharsetDetector. * They will remain valid until the detector is closed or modified * *
* Return an error if *
* The state of the Charset detector that is passed in does not * affect the result of this function, but requiring a valid, open * charset detector as a parameter insures that the charset detection * service has been safely initialized and that the required detection * data is available. * *
* Note: Multiple different charset encodings in a same family may use
* a single shared name in this implementation. For example, this method returns
* an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
* (Windows Latin 1). However, actual detection result could be "windows-1252"
* when the input data matches Latin 1 code points with any points only available
* in "windows-1252".
*
* @param ucsd a Charset detector.
* @param status Any error conditions are reported back in this variable.
* @return an iterator providing access to the detectable charset names.
* @stable ICU 3.6
*/
U_STABLE UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
/**
 * Test whether input filtering is enabled for this charset detector.
 * Input filtering removes text that appears to be HTML or XML
 * markup from the input before applying the code page detection
 * heuristics.
 *
 * @param ucsd  The charset detector to check.
 * @return TRUE if filtering is enabled.
 * @see ucsdet_enableInputFilter
 * @stable ICU 3.6
 */
U_STABLE UBool U_EXPORT2
ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
/**
 * Enable filtering of input text. If filtering is enabled,
 * text within angle brackets ("<" and ">") will be removed
 * before detection, which will remove most HTML or XML markup.
 *
 * @param ucsd   the charset detector to be modified.
 * @param filter <code>true</code> to enable input text filtering.
 * @return The previous setting.
 *
 * @see ucsdet_isInputFilterEnabled
 * @stable ICU 3.6
 */
U_STABLE UBool U_EXPORT2
ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
#ifndef U_HIDE_INTERNAL_API
/**
 * Get an iterator over the set of detectable charsets -
 * over the charsets that are enabled by the specified charset detector.
 *
 * The returned UEnumeration provides access to the names of
 * the charsets.
 *
 * @param ucsd   a Charset detector.
 * @param status Any error conditions are reported back in this variable.
 * @return an iterator providing access to the names of the charsets
 *         detectable by the specified charset detector.
 * @internal
 */
U_INTERNAL UEnumeration * U_EXPORT2
ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
/**
 * Enable or disable an individual charset encoding.
 * The name of the charset encoding must be included in the names returned by
 * ucsdet_getAllDetectableCharsets().
 *
 * @param ucsd     a Charset detector.
 * @param encoding the name of the charset encoding.
 * @param enabled  <code>TRUE</code> to enable, or <code>FALSE</code> to
 *                 disable the charset encoding.
 * @param status   receives the return status. When the name of the charset
 *                 encoding is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
 * @internal
 */
U_INTERNAL void U_EXPORT2
ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
#endif /* U_HIDE_INTERNAL_API */
#endif /* !UCONFIG_NO_CONVERSION */
#endif /* __UCSDET_H */