% -*- latex -*-
\documentclass{seminar}
\usepackage{lsalike}
\usepackage{longtable}
\usepackage{cmbright}
\usepackage{path}
\usepackage[all,ps,dvips]{xy}
\SelectTips{cm}{}
\CompileMatrices
\special{papersize=11in, 8.5in}
\newenvironment{indentLeft}
  {\begin{list}{}{\rightmargin 0pt\leftmargin 0.5in}\item[]}
  {\end{list}}
\newenvironment{indentLeftLess}
  {\begin{list}{}{\rightmargin 0pt\leftmargin 0.275in}\item[]}
  {\end{list}}
\def\pt{\phantom`}
\def\ps{\phantom*}
\def\hx{\hskip1ex\relax}
\def\gap{\vrule height0.4pt depth0pt width0.4cm}
\newcommand{\ip}[2]{#1 <\!\!< #2}

\slideframe{none}
\begin{document}
\begin{slide}\thispagestyle{empty}
\begin{center}
\begin{large}On the Automatic Construction of Grammars from the NEGRA
  Treebank\end{large}\\[12pt]
Mike Daniels\\
LING 795K\\
5 December 2002\\
\end{center}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Introduction\end{large}\end{center}
\begin{itemize}
\item Basic idea: Every treebank contains an implicit grammar. For
  every mother $A$ that dominates $B\ldots C$, we take the rule $A \to
  B \ldots C$.
\item This grammar is both large and redundant. We can therefore
  construct a similar grammar with the same coverage yet fewer rules
  and use this grammar to parse the sentences in the treebank.
\pagebreak
\item Initial goal: to develop a set of methods that yield
  the smallest grammar possible without sacrificing coverage.
\item On the other hand, early indications are that parsing efficiency
  is determined more by the ``branching factor'' of each grammar
  symbol, rather than the overall number of rules.
\item Revised goal: to develop a set of methods that simultaneously
  minimize the number of rules in the grammar as well as the branching
  factor of the grammar.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}NEGRA\end{large}\end{center}
\begin{itemize}
\item A fairly large treebank for German.
\item All nodes (except the root of a tree) are given both a
  part-of-speech tag (noun, verb, etc.) as well as a grammatical
  function label (subject, object, modifier, etc.). For our purposes,
  the tag-label pair was taken as an atomic symbol.
\item For this project, a subset of 10708 sentences was chosen: those
  that both:
\begin{itemize}
\item were rooted in the category S.
\item were devoid of coordination.
\end{itemize}
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Outline\end{large}\end{center}
\begin{enumerate}
\item Annotate the corpus with order information.
\item Untangle the annotated corpus.
\item Extract rules and remove duplicates.
\item Lift optionals from rules and remove duplicates.
\item Cluster the rules.
\item Extract LP information from clusters.
\item Turn the text rules into a prolog grammar.
\item Extract lexicon and remove duplicates.
\item Turn the lexicon into a prolog lexicon.
\item Generate a set of test sentences.
\item Turn the test sentences into prolog queries.
\end{enumerate}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Technology\end{large}\end{center}
\begin{itemize}
\item XSL interpreter (XML manipulation)
\item Perl (text processing)
\item sort
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Order Annotation\end{large}\end{center}
\begin{itemize}
\item All tree nodes in the NEGRA corpus are assigned an ID number.
  Leaves start at 1 and increase left-to-right, while interior nodes
  start at 500 and increase left-to-right, bottom-to-top. Thus no node
  refers to something that follows it.
\item The daughters of an interior node are listed in order of ID
  number.
\item Thus the information about the linear order of the daughters of an
  interior node is obscured and must be recovered.
\item We first mark each terminal with its left edge (i.e. position),
  and then each nonterminal with its left edge (the minimum of all
  daughters' left edges).
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Untangling\end{large}\end{center}
\begin{itemize}
\item While the part-of-speech tags are stored on the nodes
  themselves, the labels are stored on the mother nodes. Since we want
  to use the tag and label together as a category, they must be
  brought together.
\item This is also the point where the set of trees is turned into a
  set of rules. For each rule, its right-hand side is sorted according
  to the left edge information added in the last step.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Lifting Optional Elements\end{large}\end{center}
\begin{itemize}
\item Much of the redundancy in the initial grammar arises from
  optional elements.
\item That is, if there are five categories that could modify a given
  head, there are likely to be rules in the grammar in which each of
  the five occurs once with the head, each of the five occurs multiply
  with the head, or multiple categories appear multiply with the head.
\item We can therefore reduce the number of rules in the grammar by
  lifting the optional categories into a separate rule.
\item Currently, we go from 12367 rules to 5205 rules in this step.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Lifting Example\end{large}\end{center}
\begin{itemize}
\item This grammar:
  \begin{itemize}
  \item A:OB $\to$ B:OB C:OB D:OB\linebreak
  A:OB $\to$ E:OP F:OB G:OB\linebreak
  A:OB $\to$ H:OP I:OB J:OP
  \end{itemize}

becomes this grammar after lifting:

  \begin{itemize}
  \item A:OB $\to$ B:OB C:OB D:OB\linebreak
  A:OB $\to$ F:OB G:OB\linebreak
  A:OB $\to$ A:OB E:OP\linebreak
  A:OB $\to$ I:OB\linebreak
  A:OB $\to$ A:OB H:OP\linebreak
  A:OB $\to$ A:OB J:OP
  \end{itemize}
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Order Generalization\end{large}\end{center}
\begin{itemize}
\item So far, each of the rules from the grammar has a
  strictly-ordered right-hand side. Since this is to be a grammar for
  a linearization parser, we can conflate rules that have the same
  constituents in multiple orders.
\item For example, the rules:
\begin{itemize}
\item A $\to$ B D C\linebreak
A $\to$ B C D
\end{itemize}
\item would become
\begin{itemize}
\item A $\to$ B C D; $0 < 1$, $0 < 2$
\end{itemize}
\item This step takes us from 5205 rules to 4667 rules.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}G12N Algorithm\end{large}\end{center}
\begin{itemize}
\item We first cluster the rules: that is, we store each rule in a
  hash table indexed by the sorted right-hand side and then dump each
  bucket in the hash table.
\pagebreak
\item For example,
\begin{itemize}
\item A $\to$ B C\linebreak
A $\to$ B D\linebreak
A $\to$ B C D\linebreak
A $\to$ B D C\linebreak
A $\to$ C B\linebreak
B $\to$ B E\linebreak
B $\to$ B E C D
\end{itemize}
becomes
\begin{itemize}
\item A $\to$ B C\linebreak
A $\to$ C B
\vskip2pt\hrule width2in\vskip2pt
A $\to$ B D
\vskip2pt\hrule width2in\vskip2pt
A $\to$ B C D\linebreak
A $\to$ B D C
\vskip2pt\hrule width2in\vskip2pt
B $\to$ B E
\vskip2pt\hrule width2in\vskip2pt
B $\to$ B E C D
\end{itemize}
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}G12N Algorithm 2\end{large}\end{center}
\begin{itemize}
\item Now, for each cluster, we generate the set of order constraints
  for each rule in that cluster, intersect the sets so generated, and
  output the result.
\item Special handling is required when categories are duplicated
  within the rule.
\item From the last example:
\begin{itemize}
\item A $\to$ B C\linebreak
A $\to$ B D; 0 $<$ 1\linebreak
A $\to$ B C D; 0 $<$ 1, 0 $<$ 2\linebreak
B $\to$ B E; 0 $<$ 1\linebreak
B $\to$ B C D E; 3 $<$ 1, 3 $<$ 2, 0 $<$ 1, 0 $<$ 2, 0 $<$ 3, 1 $<$ 2\linebreak
A $\to$ B B B C; 0 $<$ 1, 1 $<$ 2, 0 $<$ 2, 0 $<$ 3
\end{itemize}
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Current status\end{large}\end{center}
\begin{itemize}
\item The final grammar of 4667 rules is still too large to be
  comfortably parsed by my parser.
\item Indications seem to be that this is the result of the branching
  factor of the grammar. For example, 171 rules expand the NN:OA symbol.
\item Future work will concentrate on reducing this branching factor.
  Two potential avenues:
\begin{itemize}
\item Introducing more categories in the lifting process.
\item Giving the parser a better search strategy: find lexical
  categories before phrasal categories.
\end{itemize}
\end{itemize}
\end{slide}
\end{document}
